Commit 59f31252 authored by Edvard Rejthar

#29 better aggregation

Signed-off-by: Edvard Rejthar <edvard.rejthar@nic.cz>
parent 4c9fd222
@@ -13,6 +13,7 @@ class Pref:
geoip = False
autoprune = True
creation_spree = False
cache = "oldcache"
@classmethod
def val2html(cls, param):
......
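The new `cache = "oldcache"` preference becomes the default of the analysis form's cache-age selector below, and `Api.run` resolves such keywords to a maximum cache age in days. A minimal sketch of that resolution, assuming a standalone helper (the function name is hypothetical):

```python
# Hedged sketch: resolve a cache keyword to a maximum cache age in days,
# mirroring the map_ lookup in Api.run below; resolve_cache_age is a made-up helper name.
def resolve_cache_age(keyword):
    map_ = {"fresh": None, "cached": 1, "weekcache": 7, "oldcache": True, True: None}
    if keyword in map_:
        return map_[keyword]   # e.g. "weekcache" -> 7 days, "oldcache" -> True (any age)
    return int(keyword)        # "days" mode: an explicit number of days

assert resolve_cache_age("oldcache") is True
assert resolve_cache_age("3") == 3
```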
import json
import logging
import subprocess
from builtins import FileNotFoundError
from glob import escape
from flask import request
from peewee import IntegrityError
from mdmaug.lib.domains import domain2dir
from mdmaug.lib.model.crawl import Crawl
from .scan_controller import ScanController
from ..config import Config
from ..model.dbp import Encounter, Whitelist
from ..model.dbp import Encounter, Whitelist, Status
from ..parser.traffic_log_parser import TrafficLogParser
from ...templates.crawl_view import CrawlView
@@ -18,34 +19,34 @@ logger = logging.getLogger("mdmaug")
class Api:
def run(self, request):
def run(self, params):
""" Accept command
:type request: dict from URL request. /api/analyze=cache/http://example.com → {"api": True, "analyze": cache, "page": "http://example.com"}
:type params: dict from URL request. /api/analyze=cache/http://example.com → {"api": True, "analyze": cache, "page": "http://example.com"}
"""
crawl = None
if "analyze" in request:
if "analyze" in params:
map_ = {"fresh": None, "cached": 1, "weekcache": 7, "oldcache": True, True: None}
if request["analyze"] in map_:
days = map_[request["analyze"]]
if params["analyze"] in map_:
days = map_[params["analyze"]]
else:
days = int(request["analyze"])
crawl = ScanController().launch(request["page"], days, request.get("autoprune") in ["y", "1", True],
request.get("creation_spree") in ["y", "1", True])
elif "aggregate" in request:
crawl = self.aggregate(request)
elif "scan" in request:
if "date" not in request:
request["date"] = ""
crawl = ScanController().get_scan(escape(request["scan"]), scan=escape(request["date"]))
elif "prune" in request:
days = int(params["analyze"])
crawl = ScanController().launch(params["page"], days, params.get("autoprune") in ["y", "1", True],
params.get("creation_spree") in ["y", "1", True])
elif "aggregate" in params:
crawl = self.aggregate(params)
elif "scan" in params:
if "date" not in params:
params["date"] = ""
crawl = ScanController().get_scan(escape(params["scan"]), scan=escape(params["date"]))
elif "prune" in params:
return ScanController.prune()
elif "nicify" in request:
return TrafficLogParser.getStylesheet() + TrafficLogParser.nicify_file(request["page"])
elif "vote" in request: # /api/vote/block/example.org/10.0.0.1
return Encounter.vote(request["vote"], request["page"])
elif "reset" in request:
elif "nicify" in params:
return TrafficLogParser.getStylesheet() + TrafficLogParser.nicify_file(params["page"])
elif "vote" in params: # /api/vote/block/example.org/10.0.0.1
return Encounter.vote(params["vote"], params["page"])
elif "reset" in params:
self.reset()
return "reset"
else:
@@ -54,31 +55,69 @@ class Api:
if crawl:
if type(crawl) is str:
return crawl # containing an error message
elif request["api"] == "json":
elif params["api"] == "json":
return CrawlView.output_json(crawl)
else:
return CrawlView.output_html(crawl)
@staticmethod
def aggregate(request):
date_from = int(request["from"])
date_to = int(request["to"])
def aggregate(params):
date_from = int(params["from"])
date_to = int(params["to"])
crawl = Crawl()
scan_count = set()
domain_count = set()
v = params.get("order", "origins")
if v == "origins":
# XX ignores aggregation dates, @see Encounter.by_origin_count
limit = request.args.get('limit', default=params["paging"], type=int)
offset = request.args.get('offset', default=0, type=int)
vote_filter = params.get("filter", None)
if vote_filter == "-":
vote_filter = None
domains, hosts, total = Encounter.by_origin_count(limit, offset, vote_filter)
for domain, host in zip(domains, hosts):
if host in crawl: # we already have a domain that connected to the host
continue
domain = domain2dir(domain)
for scan in ScanController.get_domain_scans(domain):
c = ScanController.get_scan(domain, scan)
if host in c:
crawl += c
scan_count.add("/".join([domain, scan]))
domain_count.add(domain)
break
# filter out all the hosts that appeared in the scans but that we do not want to see now (paging)
for domain in list(crawl.keys()):
if domain not in hosts:
del crawl[domain]
# re-sort the hosts by count (we might have lost the ordering if a scan reported two of the hosts -> random order)
c2_ordered = Crawl()
for host in hosts:
if host in crawl:
c2_ordered[host] = crawl[host]
crawl = c2_ordered
crawl.paging = limit, offset, total
elif v == "unordered":
# scans everything
for domain, scan in ScanController.get_all_scans():
if date_from < int(scan) < date_to:
try:
scan_count.add("/".join([domain, scan]))
domain_count.add(domain)
crawl += Crawl.load_from_scan(domain, scan)
except FileNotFoundError:
logger.warning("Wrong analysis stored at %s/%s", domain, scan)
pass
else:
return "Unknown ordering. (NOT YET IMPLEMENTED for IPS and date seen)"
crawl.title = f"Merged {len(scan_count)} scans from {len(domain_count)} domains"
if not crawl:
crawl = "No scan between these dates."
crawl = "No results with this conditions."
return crawl
@staticmethod
@@ -87,6 +126,7 @@ class Api:
with open(Config.config_file, 'w') as f: # clear the queue
json.dump({}, f)
subprocess.call(["pkill", Config.browser]) # kill frozen browsers
subprocess.call(["pkill", "Xvfb"]) # once, many Xvfb instances got open
# adds a 2nd-level domain to the whitelist
def whitelist(self):
......
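The reworked api.py above takes a `params` dict (parsed from the URL as described in `run`'s docstring) instead of shadowing Flask's `request`, and the new aggregation accepts `order`, `paging` and `filter` besides the date range. A minimal sketch of dispatching an aggregation request; the parameter values are illustrative, and the "origins" ordering additionally reads `limit`/`offset` from the Flask request, so this would run inside a request context:

```python
# Hedged sketch of calling the new aggregation path; the dict layout follows
# the docstring of Api.run, the concrete values are made up.
params = {
    "api": "json",                       # ask for JSON output instead of HTML
    "aggregate": True,                   # selects the aggregate branch in Api.run
    "from": 20190101, "to": 20190201,    # date range (ignored by the "origins" ordering, see the XX note)
    "order": "origins",                  # order hosts by the count of origins contacting them
    "paging": 100,                       # default page size, mirrors the Paging form field
    "filter": "-",                       # "-" means: no vote filter
}
result = Api().run(params)               # JSON output of the merged crawl, or an error string
```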
@@ -39,7 +39,7 @@ class ScanController:
@staticmethod
def get_scan(domain, scan: str = ""):
"""
:param domain: hostname
:param domain: hostname # XX the argument is not domain, but domain2dir(domain) (or something)
:param scan: time of scan, if not specified, we return the last
:return: Crawl object
"""
@@ -56,11 +56,17 @@ class ScanController:
@staticmethod
def get_domain_scans(domain):
"""
# XX listdir should be ordered from the most recent (or the highest in the alphabet)
# XX the argument is not domain, but domain2dir(domain) (or something)
"""
d = Config.CACHE_DIR + domain + "/"
if os.path.isdir(d):
# all possible snapshot directories
return [scan for scan in os.listdir(d) if os.path.isdir(d + scan)]
# X and os.path.isfile(d + scan + "/" + Config.CRAWL_FILE)
else:
return []
def launch(self, url, cached=None, autoprune=False, creation_spree=False):
"""
......
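`get_domain_scans` returns the snapshot directories of a cached domain in whatever order `os.listdir` yields them; the XX note asks for most-recent-first ordering. A minimal sketch of such an ordering, assuming scan directory names sort chronologically (which the `date_from < int(scan) < date_to` check in `Api.aggregate` suggests):

```python
import os

# Hedged sketch: the same listing as ScanController.get_domain_scans, but sorted
# newest first; assumes the scan directory names are sortable timestamps.
def get_domain_scans_sorted(cache_dir, domain):
    d = cache_dir + domain + "/"
    if not os.path.isdir(d):
        return []
    scans = [scan for scan in os.listdir(d) if os.path.isdir(d + scan)]
    return sorted(scans, reverse=True)   # most recent snapshot first
```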
@@ -4,12 +4,13 @@ import re
from html import escape
from flask import Blueprint, send_from_directory, render_template, request, make_response
from wtforms import Form, Label
from wtforms import Form, Label, SelectField, IntegerField
from wtforms.fields import BooleanField
from wtforms.widgets.html5 import NumberInput
from ..config import Config, Pref
from ..controller.api import Api
from ..model.dbp import Export
from ..model.dbp import Export, Status
# env = Environment()
# env.loader = FileSystemLoader(Config.DIR + "templates/")
@@ -23,7 +24,7 @@ def update_preferences():
for k, v in request.cookies.items():
if v in ["0", ""]:
v = False
elif v is "1":
elif v == "1":
v = True
setattr(Pref, k, v)
# current_app.config["preferences"][k] = v
@@ -47,7 +48,10 @@ def _generate_boolean(name, label=None, title=""):
def homepage():
update_preferences()
class OptionsForm(Form):
class AnalysisOptions(Form):
analyze = SelectField('Cache max age', default=Pref.cache,
choices=[('fresh', 'No cache'), ('cached', 'Yesterday'), ('weekcache', 'Week'),
('oldcache', 'Any cache'), ('days', 'Max days')])
safebrowsing = _generate_boolean("safebrowsing", 'Google Safebrowsing',
"Call Safebrowsing service for every object. (Slow)")
pdns = _generate_boolean("pdns", "PDNS")
@@ -56,12 +60,18 @@ def homepage():
creation_spree = _generate_boolean("creation_spree", "Creation spree",
"If the analysis exist, do not download. (Quicker for batch analyzing.)")
if request.method == 'POST':
name = request.form['name']
return 'Hello ' + name
class AggregationOptions(Form):
order = SelectField('Order', choices=[('origins', 'origins count'), ('date', 'date seen'), ('ips', 'IP count'),
('unordered', 'unordered (slow)')])
paging = IntegerField('Paging', widget=NumberInput(max=100), default=100)
filter = SelectField('Filter', choices=[("-", "-")]+[(v, "vote: " + k) for k, v in Status.enum.items()])
form = OptionsForm()
return render_template("homepage.html", form=form, APP_HOST=Config.APP_HOST, PROFILE_COUNT=Config.profile_count)
# if request.method == 'POST':
# name = request.form['name']
# return 'Hello ' + name
return render_template("homepage.html", analysis_form=AnalysisOptions(), aggregation_form=AggregationOptions(),
APP_HOST=Config.APP_HOST, PROFILE_COUNT=Config.profile_count)
@app.route('/favicon.ico')
......
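`update_preferences` copies the request cookies onto `Pref`, coercing "0"/"" to False and "1" to True; the `is "1"` comparison is replaced by `==`, since identity checks against string literals are unreliable. A minimal sketch of the same coercion outside Flask, with hypothetical cookie values:

```python
# Hedged sketch of the cookie coercion in update_preferences; the cookie dict is made up.
class Pref:
    geoip = False
    autoprune = True
    creation_spree = False
    cache = "oldcache"

cookies = {"autoprune": "0", "geoip": "1", "cache": "weekcache"}
for k, v in cookies.items():
    if v in ["0", ""]:
        v = False
    elif v == "1":           # `==` instead of `is`: a "1" parsed from a cookie need not be interned
        v = True
    setattr(Pref, k, v)

assert Pref.autoprune is False and Pref.geoip is True and Pref.cache == "weekcache"
```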
@@ -7,6 +7,7 @@
# from yaml import CLoader as Loader, CDumper as Dumper
# except ImportError:
# from yaml import Loader, Dumper
import logging
from collections import defaultdict
import jsonpickle
@@ -14,6 +15,7 @@ import jsonpickle
from ..config import Config
from ..parser.spy_parser import SpyParser
logger = logging.getLogger("mdmaug")
# from yaml import load, dump
@@ -22,9 +24,13 @@ class Crawl(defaultdict):
title: None
def __add__(self, other):
c = Crawl("merged")
for domain_name, domain in self.items():
c[domain_name] = domain
"""
For performance reasons, this is not commutative: 'other' gets merged into the original object.
"""
c = self
#c = Crawl("merged") #X performance block, with 5000 domains / 60 s (which is 10 s slower)
#for domain_name, domain in self.items():
# c[domain_name] = domain
for domain_name, domain in other.items():
if domain_name not in c:
@@ -43,9 +49,13 @@ class Crawl(defaultdict):
@staticmethod
def load_from_scan(domain, scan):
filename = Config.CACHE_DIR + domain + "/" + scan + "/" + Config.CRAWL_FILE
try:
with open(filename, 'r') as f:
# return Crawl(state=load(f.read(), Loader=Loader))
return jsonpickle.decode(f.read(), keys=True)
except FileNotFoundError:
logger.warning("Wrong analysis stored at %s/%s", domain, scan)
return {}
def __init__(self, host=None, log_dir=None, cache_dir=None, profile=None):
self.default_factory = _Domain
......
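The new `Crawl.__add__` no longer copies `self` into a fresh `Crawl("merged")`; it merges `other` into `self` and returns it, so the operation mutates the left operand and is not commutative. A minimal sketch of that behaviour, with plain dicts standing in for full `Crawl`/`_Domain` objects:

```python
# Hedged sketch of the non-commutative merge; payloads are simplified to strings.
class TinyCrawl(dict):
    def __add__(self, other):
        c = self                          # merge into self instead of copying (faster with ~5000 domains)
        for domain_name, domain in other.items():
            if domain_name not in c:
                c[domain_name] = domain
        return c

a = TinyCrawl({"example.org": "scan-1"})
b = TinyCrawl({"example.net": "scan-2"})
merged = a + b
assert merged is a                        # the left operand itself was modified
assert "example.net" in a                 # ...and now contains b's domains
```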
import datetime
import logging
import time
from collections import defaultdict
from peewee import Model, DateTimeField, IntegerField, CharField, JOIN, BigAutoField
from peewee import Model, DateTimeField, IntegerField, CharField, JOIN, BigAutoField, fn
from ..config import Config
from ..domains import url2domain
@@ -64,16 +65,22 @@ class Status(DbModel):
id = BigAutoField()
name = CharField(5)
enum = {"block": 3, "log": 2, "allow": 1, "n/a": 0}
@staticmethod
def word2int(vote):
if vote == "block":
return 3
elif vote == "log":
return 2
elif vote == "allow":
return 1
else: # instead of "n/a", "na" is transmitted (hence the generic else)
return 0
if vote in Status.enum:
return Status.enum[vote]
else:
return 0 # instead of "n/a", "na" is transmitted (hence the generic else)
# if vote == "block":
# return 3
# elif vote == "log":
# return 2
# elif vote == "allow":
# return 1
# else: # instead of "n/a", "na" is transmitted (hence the generic else)
# return 0
@staticmethod
def int2word(status):
@@ -174,17 +181,97 @@ class Encounter(DbModel):
except:
return "no update, didn't find ip"
@staticmethod
def get_related(host, ignore_host=None):
@classmethod
def get_related(cls, host, ignore_host=None):
""" returns all domain 'origin' that call this remote host """
return cls.relateds[host]
l = []
q = Encounter.select(Encounter.origin).where(Encounter.host == host).group_by(Encounter.origin)
if ignore_host:
q = q.where(Encounter.origin != ignore_host)
q = cls.select(cls.origin).where(cls.host == host).group_by(cls.origin)
if ignore_host: # ignore itself
q = q.where(cls.origin != ignore_host)
for el in q:
l.append(el.origin)
return l
relateds = defaultdict(list)
@classmethod
def prepare_relateds(cls, domains, ignore_host=None):
d = {k for k in domains if k not in cls.relateds} # some might have been preloaded in by_origin_count
if not d:
return
q = "SELECT host, group_concat(distinct origin) FROM `encounter` where origin in %s group by host ORDER BY count(*) DESC"
for host, origins in Config.db.execute_sql(q, d, ):
cls.relateds[host] = [i for i in origins.split(",") if i != ignore_host]
@classmethod
def by_origin_count(cls, limit=None, offset=None, vote_filter=None):
""" Returns set of origins ordered by the count of potentially evil host connecting to them.
XX DB should have scan_timestamp :( which differs from creation+vote updated timestamp; then we could filter
by date from/to aggregation
Returns domain that seen the tracked host and the host.
"""
vv = 0
for i in Encounter.select(Encounter.origin,
Encounter.host,
fn.Group_Concat(fn.Distinct(Encounter.origin)).alias("gr")) \
.group_by(Encounter.host) \
.order_by(fn.Count("*").desc()) \
.limit(int(limit)) \
.offset(int(offset)):
# HERE: rework this below and then add a GUI option for a regex filter on host
#print(i.ccc, i.gr)
vv += 1
# if i.origin == "vino-magnum.cz":
# import ipdb; ipdb.set_trace()
# if i.host == "ls.hit.gemius.pl":
# import ipdb;
# ipdb.set_trace()
break
query = Encounter.select(Encounter.origin,
Encounter.host,
fn.Group_Concat(fn.Distinct(Encounter.origin)).alias("gr")) \
.group_by(Encounter.host) \
.order_by(fn.Count("*").desc())
total = query.count()
if limit:
query = query.limit(int(limit))
if offset:
query = query.offset(int(offset))
print(total, "hej")
#import ipdb; ipdb.set_trace()
if 0:
total = Config.db.execute_sql("SELECT count(distinct host) FROM `encounter`"
"").fetchone()[0]
# WHERE host not regexp '\.(cz|com|org|net)'
q = "SELECT origin, host, group_concat(distinct origin) FROM `encounter`"
if vote_filter:
q += f" WHERE status = {int(vote_filter)}"
# q += "WHERE host not regexp '\.(cz|com|org|net)'" # XXX double where
q += " group by host ORDER BY count(*) DESC"
if limit:
q += f" LIMIT {int(limit)}"
if offset:
q += f" OFFSET {int(offset)}"
domains = []
hosts = []
# for origin, host, origins in Config.db.execute_sql(q):
for origin, host, origins in query.tuples().iterator():
cls.relateds[host] = origins.split(",")
domains.append(origin)
hosts.append(host)
return domains, hosts, total
class Whitelist(DbModel):
id = BigAutoField()
......
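`Status.word2int` now resolves votes through the `enum` dict instead of the if/elif chain, and `Encounter.by_origin_count` returns parallel lists of origins and hosts plus a grand total for paging. A minimal sketch of the lookup (a dict-based fallback keeping the old behaviour for the "na" value):

```python
# Hedged sketch of the enum-based vote lookup from Status.word2int.
enum = {"block": 3, "log": 2, "allow": 1, "n/a": 0}

def word2int(vote):
    # "na" (not "n/a") is what actually gets transmitted, hence the generic fallback to 0
    return enum.get(vote, 0)

assert word2int("block") == 3
assert word2int("na") == 0

# by_origin_count is consumed in Api.aggregate roughly as:
#   domains, hosts, total = Encounter.by_origin_count(limit=100, offset=0, vote_filter=None)
```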
#reply-filter input {
width: 150px;
}
[name=aggregate-dates] {
width: 300px;
}
#requests-panel .input-group.bg-success .timer {
background-color: #28a745;
color: white;
}
#analysis-result-panel > div {
margin-top: 2px;
border-top: 1px solid gray;
padding-top: 2px;
}
#analysis-result-panel .analysis {
clear: both;
}
#analysis .scans span,
#analysis-result-panel .scans span {
border: 1px dotted gray;
margin: 3px;
padding: 3px;
cursor: pointer;
}
#analysis .domain,
#analysis-result-panel .domain {
color: black;
font-weight: bold;
}
#analysis [data-vote=block] .addresses .related,
#analysis-result-panel [data-vote=block] .addresses .related {
color: white;
}
#analysis .addresses,
#analysis-result-panel .addresses {
font-size: 75%;
}
#analysis .addresses .ip,
#analysis-result-panel .addresses .ip {
color: blue;
margin-left: 5px;
}
#analysis .addresses .country,
#analysis-result-panel .addresses .country {
font-weight: bold;
}
#analysis .addresses .related,
#analysis-result-panel .addresses .related {
color: gray;
margin-left: 5px;
cursor: pointer;
}
#analysis .addresses .related span,
#analysis-result-panel .addresses .related span {
padding-left: 2px;
}
#analysis .urls,
#analysis-result-panel .urls {
margin-left: 5px;
}
#analysis .urls li,
#analysis-result-panel .urls li {
color: green;
margin: 0;
}
#analysis .urls li:hover,
#analysis-result-panel .urls li:hover {
color: #111;
}
#analysis .voting,
#analysis-result-panel .voting {
margin-right: 5px;
}
#analysis .web[data-vote=allow] .urls,
#analysis-result-panel .web[data-vote=allow] .urls,
#analysis .web[data-vote=allow] .addresses,
#analysis-result-panel .web[data-vote=allow] .addresses {
display: none;
}
#analysis .web[data-vote=block],
#analysis-result-panel .web[data-vote=block] {
background-color: #f55;
}
#analysis .web[data-vote=block] .urls p,
#analysis-result-panel .web[data-vote=block] .urls p {
color: white;
}
#analysis .web[data-vote=log],
#analysis-result-panel .web[data-vote=log] {
background-color: #cc9;
}
#analysisTemp {
clear: both;
}
.safebrowsing-status {
display: inline-block;
margin: 2px;
border: 1px dotted pink;
padding: 5px;
}
iframe .cors {
display: none;
}
#log-window {
background-color: black;
color: white;
display: none;
float: right;
margin: 5px 5px;
padding: 5px;
width: 500px;
height: 44px;
overflow: auto;
resize: vertical;
right: 0;
cursor: pointer;
z-index: 5;
}
#log-window.extended {
height: auto;
}
#log-window [role=close] {
float: right;
}
/*# sourceMappingURL=analysis-style.css.map */
\ No newline at end of file
{"version":3,"sources":["analysis-style.less"],"names":[],"mappings":"AAAA,aAAc;EACZ,YAAA;;AAGF;EACE,YAAA;;AAGF,eAAgB,aAAY,WAAY;EACtC,yBAAA;EACA,YAAA;;AAGF,sBACE;EACE,eAAA;EACA,0BAAA;EACA,gBAAA;;AAJJ,sBAME;EACE,WAAA;;AAIJ,SAIE,OACE;AAL0B,sBAI5B,OACE;EACE,uBAAA;EACA,WAAA;EACA,YAAA;EACA,eAAA;;AATN,SAYE;AAZ4B,sBAY5B;EACE,YAAA;EACA,iBAAA;;AAdJ,SAgBE,kBAAkB,WAAW;AAhBD,sBAgB5B,kBAAkB,WAAW;EAC3B,YAAA;;AAjBJ,SAmBE;AAnB4B,sBAmB5B;EACE,cAAA;;AApBJ,SAmBE,WAEE;AArB0B,sBAmB5B,WAEE;EACE,WAAA;EACA,gBAAA;;AAvBN,SAmBE,WAME;AAzB0B,sBAmB5B,WAME;EACE,iBAAA;;AA1BN,SAmBE,WASE;AA5B0B,sBAmB5B,WASE;EACE,WAAA;EACA,gBAAA;EACA,eAAA;;AA/BN,SAmBE,WASE,SAIE;AAhCwB,sBAmB5B,WASE,SAIE;EACE,iBAAA;;AAjCR,SAsCE;AAtC4B,sBAsC5B;EACE,gBAAA;;AAvCJ,SAsCE,MAEE;AAxC0B,sBAsC5B,MAEE;EAEE,YAAA;EACA,SAAA;;AACA,SANJ,MAEE,GAIG;AAAD,sBANJ,MAEE,GAIG;EACC,WAAA;;AA7CR,SAiDE;AAjD4B,sBAiD5B;EACE,iBAAA;;AAlDJ,SAoDE,KAAI,iBACF;AArD0B,sBAoD5B,KAAI,iBACF;AArDJ,SAoDE,KAAI,iBACK;AArDmB,sBAoD5B,KAAI,iBACK;EACL,aAAA;;AAtDN,SAyDE,KAAI;AAzDwB,sBAyD5B,KAAI;EACF,sBAAA;;AA1DJ,SAyDE,KAAI,iBAEF,MACA;AA5D0B,sBAyD5B,KAAI,iBAEF,MACA;EACE,YAAA;;AA7DN,SAiEE,KAAI;AAjEwB,sBAiE5B,KAAI;EACF,sBAAA;;AAIJ;EACE,WAAA;;AAGF;EACE,qBAAA;EACA,WAAA;EACA,uBAAA;EACA,YAAA;;AAGF,MAAO;EACL,aAAA;;AAGF;EACI,uBAAA;EACA,YAAA;EACA,aAAA;EACA,YAAA;EACA,eAAA;EACA,YAAA;EACA,YAAA;EACA,YAAA;EACA,cAAA;EACA,gBAAA;EAEA,QAAA;EACA,eAAA;EACA,UAAA;;AACA,WAAC;EACG,YAAA;;AAhBR,WAmBI;EACI,YAAA","file":"analysis-style.css"}
\ No newline at end of file
#reply-filter input {
width: 150px;
}
[name=aggregate-dates] {
width: 300px;
}
#requests-panel .input-group.bg-success .timer {
background-color: rgb(40, 167, 69);
color: white;
}
#analysis-result-panel {
> div {
margin-top: 2px;
border-top: 1px solid gray;
padding-top: 2px;
}
.analysis {
clear: both;
}
}
#analysis /* MDM frontend */, #analysis-result-panel /* MDMaug frontend */ {
.screenshot {
//float: right;
}
.scans {
span {
border: 1px dotted gray;
margin: 3px;
padding: 3px;
cursor: pointer;
}
}
.domain {
color: black;
font-weight: bold;
}
[data-vote=block] .addresses .related {
color: white;
}
.addresses {
font-size: 75%;
.ip {
color: blue;
margin-left: 5px;
}
.country {
font-weight: bold;
}
.related {
color: gray;
margin-left: 5px;
cursor: pointer;
span {
padding-left: 2px;
}
}
}
.urls {
margin-left: 5px;
li {
//cursor: pointer;
color: green;
margin: 0;
&:hover {
color: #111;
}
}
}
.voting {
margin-right: 5px;
}
.web[data-vote=allow] {
.urls, .addresses {
display: none;
}
}
.web[data-vote=block] {
background-color: #f55;
.urls
p {
color: white;
}
}
.web[data-vote=log] {
background-color: #cc9;
}
}
#analysisTemp {
clear: both; // floating screenshot won't overflow
}
.safebrowsing-status {
display: inline-block;
margin: 2px;
border: 1px dotted pink;
padding: 5px;
}
iframe .cors {
display: none
}
#log-window {
background-color: black;
color: white;
display:none;
float:right;
margin:5px 5px;
padding: 5px;
width: 500px;
height: 44px;
overflow: auto;
resize: vertical;
//position:absolute;
right:0;
cursor: pointer;
z-index:5;
&.extended {
height: auto;
}
[role=close] {
float:right;
}
}
\ No newline at end of file
......@@ -16,7 +16,30 @@ const RELATED_SELECTOR = ".analysis > form > [data-group] > .web > .addresses >
const SCAN_SELECTOR = ".analysis > form > .scans > span";
//const APP_HOST = must be defined before // "https://217.31.202.41:8000";
var $analysis_panel;
var $analysis;
//var $analysis;
var KEY = {
DOWN: 40,
UP: 38,
LEFT: 37,