Commit f7bd1155 authored by Edvard Rejthar

package format, installation steps

parent e24ff7f7
.idea
__pycache__
*.pem
\ No newline at end of file
......@@ -9,7 +9,7 @@ PROFILE_COUNT=21
apt install software-properties-common
add-apt-repository "deb http://archive.ubuntu.com/ubuntu $(lsb_release -sc) main universe restricted multiverse"
apt update
apt install firefox python3 mariadb-server
apt install firefox python3 mariadb-server xvfb
pip3 install xvfbwrapper pymysql peewee jinja2 pyyaml bs4 pygments pillow requests
# current dir
......@@ -19,7 +19,7 @@ cd $DIR
# mariadb setup
systemctl start mariadb.service
mysql -u root < mdmaug-installation.sql # populate db
mysql -uroot -e "CREATE USER 'mdmaug'@'localhost' IDENTIFIED BY 'fidFDSs676'; GRANT ALL PRIVILEGES ON mdmaug. * TO 'mdmaug'@'%';" # new user
mysql -uroot -e "CREATE USER 'mdmaug'@'localhost' IDENTIFIED BY 'fidFDSs676'; GRANT ALL PRIVILEGES ON mdmaug. * TO 'mdmaug'@'localhost';" # new user
# add the user the server will be run under
useradd -m -d $DESTINATION mdmaug
......@@ -41,8 +41,11 @@ do
fi
done
# transfer ownership of all files to the new user
chown mdmaug:mdmaug -R $DESTINATION
# make the new user able to use the display (needed on Ubuntu 17.10 at least)
xhost +local:mdmaug
......@@ -4,10 +4,13 @@ Scans a website for a sign of a parasite hosts or commands.
## Installation
1. ```git clone git@gitlab.labs.nic.cz:csirt/mdmaug.git /tmp/mdmaug```
2. edit mdmaug/lib/config.py
3. you should generate certificate `openssl req -new -x509 -keyout cert-mdmaug.pem -out cert-mdmaug.pem -days 365 -nodes` to `mdmaug/cert-mdmaug.pem`
4. ```/tmp/mdmaug/INSTALL```
1. Download the sources: ```git clone git@gitlab.labs.nic.cz:csirt/mdmaug.git /tmp/mdmaug```
2. Edit mdmaug/lib/config.py
3. Generate a certificate into `mdmaug/cert-mdmaug.pem`: `openssl req -new -x509 -keyout cert-mdmaug.pem -out cert-mdmaug.pem -days 365 -nodes`
4. Perform installation: ```/tmp/mdmaug/INSTALL```
5. Everything should be located in `/opt/mdmaug`.
6. Launch it under the newly created `mdmaug` user: `su - mdmaug -c 'python3 -m mdmaug'`
7. Connect with a browser at https://localhost:8000 (a quick smoke test is sketched below this list).
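A quick way to verify a fresh installation (a minimal sketch; it assumes the default port 8000 from config.py and the self-signed certificate from step 3, hence `verify=False`):

```python
# Minimal smoke test (assumed defaults: APP_PORT 8000, self-signed cert-mdmaug.pem)
import requests

response = requests.get("https://localhost:8000/", verify=False, timeout=10)
print(response.status_code)  # 200 means the homepage is being served
```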
### Notes
......@@ -15,6 +18,7 @@ Scans a website for a sign of a parasite hosts or commands.
* Certificate error: Make sure that the browser doesn't block the MDM-Augmented server if used from MDM.
* If you want a different number of profiles than 21, change INSTALL + config.py + profiles.ini
* You may put ```03 1,7,13,19 * * * ~/mdmaug-launch``` in ```crontab -e``` of user mdmaug.
* We are using Python 3.6+.
## What is done to Firefox profiles?
......
/__pycache__/
/lib/__pycache__/
/lib/analysis/__pycache__/
/lib/analysis/parser/__pycache__/
/nbproject/
/templates/__pycache__/
cert-mdmaug.pem
#!/usr/bin/env python3
import logging
import os
import ssl
import threading
import logging
logging.basicConfig(level=logging.DEBUG, format="%(message)s")
from http.server import HTTPServer
from xvfbwrapper import Xvfb
from lib.config import Config
from lib.controller.server import Server
from lib.controller.api import Api
# import ipdb; ipdb.set_trace()
#logging.basicConfig(level=logging.WARNING, format="%(levelname)s: %(message)s",filename="logger.log")
from .lib.config import Config
from .lib.controller.server import Server
from .lib.controller.api import Api
# make sure the logging dir exists
if not os.path.exists(Config.LOG_DIR):
os.makedirs(Config.LOG_DIR)
# setup multithreading server
# server setup
Api.reset()
httpd = HTTPServer(('0.0.0.0', Config.APP_PORT), Server)
address = '0.0.0.0'
httpd = HTTPServer((address, Config.APP_PORT), Server)
httpd.socket = ssl.wrap_socket(httpd.socket,
server_side=True,
certfile= Config.DIR + 'python.pem', # together private + cert, http://stackoverflow.com/questions/19705785/python-3-https-webserver
# together private + cert, http://stackoverflow.com/questions/19705785/python-3-https-webserver
certfile=Config.DIR + 'cert-mdmaug.pem',
ssl_version=ssl.PROTOCOL_TLSv1)
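# Xvfb provides a virtual X display so that Firefox (installed alongside the xvfb package in INSTALL) can run on a machine without a physical screen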
vdisplay = Xvfb()
vdisplay.start()
display = Xvfb()
display.start()
try:
print('Listening at https://0.0.0.0:{}'.format(Config.APP_PORT))
print(f'Listening at https://{address}:{Config.APP_PORT}')
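# start Config.profileCount serving threads on the shared listening socket (presumably one per Firefox profile, so several scans can be handled concurrently)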
for _ in range(Config.profileCount):
threading.Thread(target=httpd.serve_forever).start()
except (KeyboardInterrupt, SystemExit):
vdisplay.stop()
display.stop()
'''
XX TO BE DELETED:
How to debug mysql:
conn = pymysql.connect(host='localhost', user='root', passwd='lopuch', db='mdmaug', charset='utf8')
cur = conn.cursor()
......@@ -55,4 +57,4 @@ quit()
#from urllib.parse import parse_qs
#from urllib.parse import urlparse
#quit()
'''
\ No newline at end of file
'''
/mnt/mdmaug/home/mdmaug/.mozilla/extensions/{ec8030f7-c20a-464f-9b0e-13a3a9e97384}/mdmaug@jetpack/resources/mdmaug/
\ No newline at end of file
import threading
import os
import logging
from glob import glob
import os
import threading
from peewee import MySQLDatabase
class Config:
profileCount = 21 # pocet profilu vytvorenych ve firefoxu. Tyto je treba vytvorit rucne. Nazev profilu je cislo - 0,1...
browser = 'firefox' # iceweasel, firefox. Ktery prohlizec se spousti.
configFile = '/opt/mdmaug/.cache/mdmaug-scans/_tmp/queue.cache' # RAM disk byl maly: '/tmp/mdm/queue.cache'
profileCount = 21 # number of Firefox profiles; each profile's name is just a number: 0, 1, ...
browser = 'firefox' # iceweasel or firefox - which browser gets launched
configFile = '/opt/mdmaug/.cache/mdmaug-scans/_tmp/queue.cache' # RAM disk was too small: '/tmp/mdm/queue.cache'
APP_PORT = 8000
APP_DOMAIN = 'https://217.31.202.41:' + str(APP_PORT) #csirt.csirt.office.nic.cz
LOG_DIR = "/opt/mdmaug/.cache/mdmaug-scans/_tmp/" # X /tmp/mdm/
CACHE_DIR = "/opt/mdmaug/.cache/mdmaug-scans/"
APP_DOMAIN = 'https://217.31.202.41:' + str(APP_PORT) # csirt.csirt.office.nic.cz
LOG_DIR = "/opt/mdmaug/.cache/mdmaug-scans/_tmp/"
CACHE_DIR = "/opt/mdmaug/.cache/mdmaug-scans/"
DIR = os.path.dirname(os.path.realpath(__file__)) + "/../"
myDB = ""
lock = threading.RLock() # doufam, ze kdyz je lock tady, ze je funknci. Closure...? XX nejak otestovat
myDB = None
lock = threading.RLock()  # I hope the lock works when placed here. Closure...? XX test this somehow
THUMBNAIL_SIZE = 640, 640
MAX_WHOIS_DOMAIN_THREADS = 10 # spusti maximalne 10 threadu doraz, jednou mi to totiz preteklo (kazda domena spusti jeste tolik threadu, kolik ma IP, ale tech byva jen par)
MAX_BROWSER_RUN_TIME = 25 # maximalni cas, ktery muze browser bezet
MAX_BROWSER_EXPIRATION = 15 # pocet vterin, ktere muzeme max cekat, nez se browser zavre (trva, nez zapise soubory)
MAX_WHOIS_DOMAIN_THREADS = 10  # launch at most 10 threads at once; it overflowed once (each domain spawns as many threads as it has IPs, but that is usually only a few)
MAX_BROWSER_RUN_TIME = 25 # maximum time for a browser to run
MAX_BROWSER_EXPIRATION = 15 # seconds that we wait before killing the browser (waiting for the files to be written)
def connect():
# XX addressing peewee.OperationalError: (2006, "MySQL server has gone away (BrokenPipeError(32, 'Broken pipe'))") after the 7-hour timeout
# XX surprisingly the connection cannot live in dbp DBModel.connect. Even though its type is then correct (MySQLDatabase), nothing can be done and select attempts end with NoneType.
logging.debug("Connecting to DB.")
Config.myDB = MySQLDatabase("mdmaug", host='localhost', port=3306, user="mdmaug", passwd="fidFDSs676") # XX dal jsem pryc: , threadlocals=False
Config.myDB.register_fields({'primary_key': 'BIGINT AUTOINCREMENT'})
Config.myDB = MySQLDatabase("mdmaug", host='localhost', port=3306, user="mdmaug",
passwd="fidFDSs676") # XX dal jsem pryc: , threadlocals=False
Config.connect()
import json
import subprocess
import logging
from lib.config import Config
from lib.controller.scan_controller import ScanController
from lib.model.dbp import Status, Export, Turris, Whitelist
from lib.analysis.parser.traffic_log_parser import TrafficLogParser
import subprocess
from peewee import IntegrityError
from ...templates.crawl_view import CrawlView
from .scan_controller import ScanController
from ..config import Config
from ..model.dbp import Turris, Whitelist
from ..parser.traffic_log_parser import TrafficLogParser
class Api:
website = "" # http://site.cz
websiteDomain = "" # site.cz
website = "" # http://site.cz
websiteDomain = "" # site.cz
def __init__(self, path):
self.path = path
def run(self, request):
""" Accept command
:param request: dict built from the URL request. /api/analyze=cache/http://example.com → {"api": True, "analyze": "cache", "page": "http://example.com"}
"""
def run(self, cmd):
""" Accept command """
if cmd == "analyze":
return ScanController().launch(self.path)
if cmd == "analyze=cached":
return ScanController().launch(self.path, cached = 1)
if cmd == "analyze=weekcache":
return ScanController().launch(self.path, cached = 7)
if cmd == "analyze=oldcache":
return ScanController().launch(self.path, cached = True)
elif cmd == "export=view": # XX deprecated?
return Export.exportView()
elif cmd == "export=confirm": # XX deprecated?
return Export.exportConfirm()
elif cmd == "decide": # XX deprecated?
return self.getUndecided()
elif cmd == "nicify":
url = self.path.split("/", 3)
return TrafficLogParser.getStylesheet() + TrafficLogParser.nicifyFile(url[3])
elif cmd == "vote": # /api/vote/block/example.org/10.0.0.1
if "analyze" in request:
crawl = ScanController().launch(request["page"], {"cached": 1, "weekcache":7, "oldcache": True, True: None}[request["analyze"]])
if request["api"] == "json":
return CrawlView.output_json(crawl)
else:
return CrawlView.output_html(crawl)
elif "decide" in request: # XX deprecated?
return self.get_undecided()
elif "nicify" in request:
return TrafficLogParser.getStylesheet() + TrafficLogParser.nicifyFile(request["page"])
elif "vote" in request: # /api/vote/block/example.org/10.0.0.1
logging.debug("vote cmd")
url = self.path.split("/", 4)
logging.debug(url[3])
return Turris.vote(url[3], url[4])
elif cmd == "whitelist": # XXX not implemented yet
url = self.path.split("/", 3)
return Turris.vote(request["vote"], request["page"])
elif "whitelist" in request: # XXX not implemented yet
"""url = path.split("/", 3)
if len(url) > 3:
self._setWebsite(url[2]) # osetrit self.website, ze je URL, a nikoli shell
logging.debug("XXX nejsem si jist, zda url je spravne na url[2]") # XXX
logging.debug(url) # XXX
quit() # XXX
self._setWebsite(url[2])  # make sure self.website is a URL and not e.g. a shell command
logging.debug("XXX not sure whether the url is correctly at url[2]")  # XXX
logging.debug(url) # XXX
quit() # XXX
logging.debug(self.website)
logging.debug(self.websiteDomain)
return self.whitelist()
elif cmd == "reset":
Server.reset()
return self.whitelist()"""
return "Implement first if needed."
elif "reset" in request:
self.reset()
return "reset"
@staticmethod
def reset():
logging.debug("resetting running browsers")
with open(Config.configFile, 'w') as f: # clear the queue
with open(Config.configFile, 'w') as f: # clear the queue
json.dump({}, f)
subprocess.call(["pkill", Config.browser]) # kill frozen browsers
subprocess.call(["pkill", Config.browser]) # kill frozen browsers
#prida 2ld domenu mezi whitelistovane
# add the 2nd-level domain among the whitelisted ones
def whitelist(self):
logging.debug("whitelistuju")
#Db.cur = Db.connection.cursor()
#self._logging.debug(Db.cur.execute("""REPLACE INTO whitelist set domain = %s""", (self.websiteDomain, )))
#Db.connection.commit()
#Db.cur.close()
try:Whitelist.insert(domain=self.websiteDomain).execute()
except IntegrityError:pass # jiz je vlozeno
# Db.cur = Db.connection.cursor()
# self._logging.debug(Db.cur.execute("""REPLACE INTO whitelist set domain = %s""", (self.websiteDomain, )))
# Db.connection.commit()
# Db.cur.close()
try:
Whitelist.insert(domain=self.websiteDomain).execute()
except IntegrityError:
pass  # already inserted
def getUndecided(self):
@staticmethod
def get_undecided():
logging.debug("XXX jeste jsem neudelal - ma vylezt tabulka vsech nerozhodlych domen od posledniho exportu")
pass
\ No newline at end of file
pass
This diff is collapsed.
from http.server import SimpleHTTPRequestHandler
from jinja2 import Environment
from jinja2 import FileSystemLoader
from lib.config import Config
from lib.controller.api import Api
from lib.model.dbp import DbModel
from lib.model.dbp import Export
import logging
import mimetypes
import os
from http.server import SimpleHTTPRequestHandler
from jinja2 import Environment
from jinja2 import FileSystemLoader
from ..config import Config
from ..controller.api import Api
from ..model.dbp import DbModel
from ..model.dbp import Export
env = Environment()
env.loader = FileSystemLoader(Config.DIR + "templates/")
class Server(SimpleHTTPRequestHandler):
def favicon(self):
with open('favicon.ico', 'rb') as f:
self.output(f.read(), "image/x-icon")
def render_template(self, filename, ** kwargs):
def render_template(self, filename, **kwargs):
self.output(env.get_template(filename).render(kwargs))
def output(self, contents, contentType="text/html"):
def output(self, contents, content_type="text/html"):
self.send_response(200)
self.send_header("Content-type", contentType)
self.send_header("Content-type", content_type)
self.end_headers()
try:
self.wfile.write(contents)
......@@ -34,26 +37,61 @@ class Server(SimpleHTTPRequestHandler):
self.render_template("homepage.html")
def static_file(self, url):
is_binary_string = lambda bytes: bool(bytes.translate(None, bytearray([7, 8, 9, 10, 12, 13, 27]) + bytearray(range(0x20, 0x100))))
type = 'rb' if is_binary_string(open('/usr/bin/python', 'rb').read(1024)) else 'r'
with open(url, type) as f:
self.output(f.read(), contentType=mimetypes.guess_type(url))
# open in 'rb' if the sampled content reads as binary, else in 'r'
is_binary = bool(open('/usr/bin/python', 'rb').read(1024).translate(
None, bytearray([7, 8, 9, 10, 12, 13, 27]) + bytearray(range(0x20, 0x100))))
type_ = 'rb' if is_binary else 'r'
with open(url, type_) as f:
self.output(f.read(), content_type=mimetypes.guess_type(url))
def do_GET(self):
path = self.path.split("/")
"""
Routing table:
/ → homepage
/existing-file → return the static file from /static
/(destination=example.com/)api... → if set, the output will be HTML5-postMessaged to other tab at the destination (with https protocol)
/api(=json)/ → output might be either in JSON, or else in HTML
/api/analyze(=...)/URI
/api/vote/...
/api/reset
/export/(days) → CSV of last X days
"""
_, path = self.path.split("/", 1)
path, *_ = path.split("?", 1)
logging.debug("Request: {}".format(path[1]))
if path[1] == "":
if path == "":
return self.homepage()
elif os.path.isfile(Config.DIR + "static/" + path[1]): #faviconka, nebo jiny existujici soubor
return self.static_file(Config.DIR + "static/" + path[1])
elif os.path.isfile(Config.DIR + "static/" + path): # favicon or any other existing file
return self.static_file(Config.DIR + "static/" + path)
DbModel.assureConnection()
if path[1] == "api": # /api/analyze/web
cmd = path[2]
api = Api(self.path)
# send everything up, we are in an iframe
self.render_template("_message.html", contents=api.run(cmd), cmd=cmd, url=self.path, destination="https://mdm.nic.cz/")
elif path[1] == "export": # /export/{days} - csv za poslednich 7 dni
# parse the request url into a friendly dictionary
request = {"page": ""}
page = False
for l in self.path.split("/")[1:]:
if not page:
c, *d = l.split("=", 1)
if c in ["http:", "https:"]:
page = True
else:
request[c] = d[0] if len(d) else True
continue
request["page"] += l + "/"
if request["page"]: # strip last slash
request["page"] = request["page"][:-1]
logging.debug("Request: {}".format(request))
if "api" in request: # /api/analyze/web
output = Api().run(request)
if "destination" in request:
# send everything up, we are in an iframe
self.render_template("_message.html", contents=output, cmd=request, url=self.path,
destination=f"https://{request['destination']}/")
else:
self.output(output)
elif "export" in request: # /export/{days} - csv of last 7 days
url = self.path.split("/", 2)
self.output(Export.exportView(days=url[2]))
\ No newline at end of file
self.output(Export.export_view(days=url[2]))
import re
import socket
import logging
import re
from urllib.parse import parse_qs
from urllib.parse import urlparse
import socket
import urllib.request
class Domains:
""" webove nastroje """
def get_ips_for_host(host):
try:
ips = socket.getaddrinfo(host, 80, 0, 0, socket.IPPROTO_TCP) # XXX co kdyz nepratelsky web reaguje jen na 80, 81, 8080
ips = socket.getaddrinfo(host, 80, 0, 0, socket.IPPROTO_TCP)  # XXX what if a hostile site responds only on 80, 81, 8080
except socket.gaierror:
ips = []
return ips
......@@ -23,53 +21,57 @@ class Domains:
except TypeError:
logging.debug("Domains/url2domain type error")
logging.debug(url)
raise #return ""
raise # return ""
def url2path(url):
""" http://seznam.cz/url -> /url """
url = re.sub('^(http://|https://|ftp://)', '', url) # odstrihnout protokol
url = re.sub('^([^/])*', '', url) # stojim jen cestu, ne o domenu
url = re.sub('^(http://|https://|ftp://)', '', url)  # strip the protocol
url = re.sub('^([^/])*', '', url)  # I only want the path, not the domain
return url
def assureUrl(url): # zajistit, ze se jedna o url a ne treba o shell
# XX co ostatni protokoly? smb, sftp? Hrozi tam nejake nebezpeci?
return re.search('(((((http|https|ftp)://)?[\w\-_]+(?:(?:\.[\w\-_]+)+)))([\w\-\.,@?^=%&amp;:/~\+#]*[\w\-\@?^=%&amp;/~\+#])?)', url).group(0)
def assureUrl(url):  # make sure it is a URL and not e.g. a shell command
# XX what about other protocols? smb, sftp? Is there a danger?
return re.search(
'(((((http|https|ftp)://)?[\w\-_]+(?:(?:\.[\w\-_]+)+)))([\w\-\.,@?^=%&amp;:/~\+#]*[\w\-\@?^=%&amp;/~\+#])?)',
url).group(0)
def domain2dir(url): # friendly nazev adresare z domeny v url
def domain2dir(url):  # friendly directory name derived from the domain in the url
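# e.g. (illustrative) "https://www.Example.com/path" → "www.example.com"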
url = url.lower()
url = re.sub('^(http://|https://|ftp://)', '', url) # odstrihnout protokol
url = re.sub('(/.*)', '', url) # nestojim o cestu, jen o domene
url = re.sub('[^a-z0-9\.]', '', url) # nechat jen pratelske znaky
url = re.sub('^(http://|https://|ftp://)', '', url)  # strip the protocol
url = re.sub('(/.*)', '', url)  # I don't want the path, only the domain
url = re.sub('[^a-z0-9\.]', '', url)  # keep only friendly characters
return url
def getPdnsLink(ip):
return 'http://pdns.cert.at/p/dns?qry=' + ip
def ip2pdnsDomains(ip):
return None #24 doesnt work
"""
try:
# XX I could sort by the 2nd-level domain. Or cut off the 3rd-level domain and keep only the 2nd. But I would have to handle problematic double TLDs there - co.uk, gov.ua...
pdns = urllib.request.urlopen(Domains.getPdnsLink(ip)).read().decode("utf-8")
items = re.findall("<div class='x[BA]'>(.*)</div>", pdns)
items = re.findall("<div class='x[BA]'>(.*)</div>", pdns)
return items
except Exception as e:
logging.debug("chyba pri kontaktu s PDNS: " + str(e))
return None
"""
return None # #24 doesnt work
def ip2countryAndCity(ip):
return None, None #23 service down
"""
try:
hostipApi = urllib.request.urlopen('http://api.hostip.info/get_html.php?ip=' + ip + '&position=true').read().decode("utf-8").split("\n")
#['Country: CZECH REPUBLIC (CZ)', 'City: Prague', '', 'Latitude: 50.0833', 'Longitude: 14.4333', 'IP: 109.123.209.188', '']
hostipApi = urllib.request.urlopen('http://api.hostip.info/get_html.php?ip=' + ip + '&position=true').read().decode(
"utf-8").split("\n")
# ['Country: CZECH REPUBLIC (CZ)', 'City: Prague', '', 'Latitude: 50.0833', 'Longitude: 14.4333', 'IP: 109.123.209.188', '']
return hostipApi[0].split(":")[1], hostipApi[1].split(":")[1]
except UnicodeDecodeError: # as e
except UnicodeDecodeError: # as e
logging.debug("neumim dekodovat")
except Exception as e:
logging.debug("hostip.info down: " + str(e))
return None, None
"""
return None, None # #23 service down
##
# Contacts the safebrowsing service and tries to read from its inconsistent data whether a red warning page appears when someone visits the URL.
......@@ -77,17 +79,17 @@ class Domains:
# It is also possible that the service has changed its wording. Good luck!
#
# @param format 'bool' returns bool True/False/None, or 'attr' returns "1"/"0"/"" for an attribute
def isSuspicious(domain, output='bool'):
#contents = urllib.request.urlopen('http://www.google.com/safebrowsing/diagnostic?site=' + domain).read().decode("utf-8")
#with open("debugsf.tmp","a") as f:
def is_suspicious(domain, output='bool'):
# contents = urllib.request.urlopen('http://www.google.com/safebrowsing/diagnostic?site=' + domain).read().decode("utf-8")
# with open("debugsf.tmp","a") as f:
# f.write(contents + "\n\n")
#if "Site is listed as suspicious" in contents:
#elif "This site is not currently listed as suspicious." in contents:
import requests, re, json
# if "Site is listed as suspicious" in contents:
# elif "This site is not currently listed as suspicious." in contents:
import requests
r = requests.get("http://www.google.com/safebrowsing/diagnostic?output=jsonp&site=" + domain, timeout=5)
if '"listed"' in r.text:
return True if output == 'bool' else "1"
if '"unlisted"' in r.text: # vratilo to alespon neco rozumneho
if '"unlisted"' in r.text: # vratilo to alespon neco rozumneho
return False if output == 'bool' else "0"
else:
return None if output == 'bool' else ""
\ No newline at end of file
return None if output == 'bool' else ""
from yaml import load, dump
from ..config import Config
from ..parser.spy_parser import SpyParser
try:
from yaml import CLoader as Loader, CDumper as Dumper
except ImportError:
from yaml import Loader, Dumper
from collections import defaultdict
from lib.config import Config
class Crawl(defaultdict):
""" Objekt Crawl udružuje výsledky vyhledávání """
def __str__(self):
r = "Výsledky vyhledávání - navštíveno {} domén".format(len(self))
for key in self.keys():
r += "\n* " + key + " " + str(self[key])
return r
def saveToFile(self,filename):
def save_to_file(self, filename):
with open(filename, "w") as f:
f.write(dump(self.__getstate__(), Dumper=Dumper))
def loadFromFile(filename):
@staticmethod
def load_from_file(filename):
with open(filename, 'r') as f:
return Crawl(state = load(f.read(), Loader=Loader))
return Crawl(state=load(f.read(), Loader=Loader))
def __init__(self, host = None, state = None, logDir = None, cacheDir = None):
def __init__(self, host=None, state=None, log_dir=None, cache_dir=None):
""" State muze obsahovat vystup __getstate__() (serializace YAMLem) """
self.default_factory = _Domain
self.screenfile = None # HTML output XXX
if host:
self.host = host
if logDir:
self.logDir = logDir
if cacheDir:
self.cacheDir = cacheDir
self.screenfile = None # HTML output XXX
# if host:
self.host = host
# if log_dir:
self.logDir = log_dir
# if cache_dir:
self.cacheDir = cache_dir
if state:
self.__setstate__(state)
pass
def __getstate__(self):
state = self.__dict__.copy()
state["keys"] = [[x for x in (key, self[key].__getstate__())] for key in self.keys()]
return state
def __setstate__(self,state):
#logging.debug("fdsfsfds",self.__dict__)
def __setstate__(self, state):
for tup in state["keys"]:
key, val = tup
self[key].__setstate__(val)
del state["keys"]
self.__dict__ = state
self.__dict__ = state
class _Domain(defaultdict):
""" Navstivena domena behem crawlu """
def __str__(self):
r = "{} adres a {} url".format(len(self.addresses), len(self.urls))
for key in self.urls.keys():
......@@ -71,8 +73,8 @@ class _Domain(defaultdict):
return r
def __init__(self):
#self.vote = None
#self.urls = set()
# self.vote = None
# self.urls = set()
self.urls = defaultdict(_Url)
self.addresses = defaultdict(_Address)
self.pdns = set()
......@@ -82,8 +84,6 @@ class _Domain(defaultdict):
state["addresses"] = [[x for x in (key, self.addresses[key].__dict__)] for key in self.addresses]
state["urls"] = [[x for x in (key, self.urls[key].__dict__)] for key in self.urls]
return state
#return {'urls': self.urls, 'vote':self.vote, 'addresses':
# [[x for x in (key, self.addresses[key].__dict__)] for key in self.addresses]}
def __setstate__(self, state):
for tup in state["addresses"]:
......@@ -99,50 +99,52 @@ class _Domain(defaultdict):
for key in state:
self.__dict__[key] = state[key]
return
class _Url(set):
""" Unikatni navstivena url """
def __str__(self):
return str(self.__dict__)
#return "spyfile {} {}, vote {}".format(self.spyfile, self.city, self.vote)
def __init__(self):
self.spyfile = [] # cesta k souboru se podezrelym kodem poustenym strankou
self.sourcefile = [] # cesta k souboru se zdrojovym kodem. Muze jich byt vice, http://seznam.cz/index.htm a https://seznam.cz/index.htm jsou oba pod domenou seznam.cz
# paths to files with a suspicious code, run by the inspected page
self.spyfiles = []
# paths to files with source codes.
# Both 'http://example.com/index.htm' and 'https://example.com/index.htm' are under example.com domain.
self.sourcefiles = []
def add_sourcefile(self, sourcefile):
self.sourcefiles.append(sourcefile)
def addSourcefile(self, sourcefile):
#if self.sourcefile != None:
# raise RuntimeError('sourcefile', 'uz bylo definovano ' + self.sourcefile) # na tohle nejsme pripraveni - asi funkce v main.js pro jmeno souboru je spatna, protoze je jina od Domains.funkce
self.sourcefile.append(sourcefile)
def add_spyfile(self, spyfile):
self.spyfiles.append(spyfile)
def addSpyfile(self, spyfile):
#if self.spyfile != None:
# raise RuntimeError('spyfile', 'uz bylo definovano') # na tohle nejsme pripraveni - asi funkce v main.js pro jmeno souboru je spatna, protoze je jina od Domains.funkce
self.spyfile.append(spyfile)
def list_spyfiles(self):
for file in self.spyfiles:
text, shorten = SpyParser.get_short(file)
yield file, text, shorten, Config.APP_DOMAIN + "/api/nicify/" + file
def list_sourcefiles(self):
for file in self.sourcefiles:
yield Config.APP_DOMAIN + "/api/nicify/" + file
class _Address(set):
""" Adresa navstivene domeny """
#def __getstate__(self):
# return {self.country,self.city, self.vote}
#def __setstate__(self):
# pass
def __str__(self):
return "K adrese je připojena geolokace {} {}".format(self.country, self.city) # , vote {} , self.vote
return "K adrese je připojena geolokace {} {}".format(self.country, self.city) # , vote {} , self.vote