traffic_log_parser.py 6.71 KB
Newer Older
1 2
# Parses the log produced by Firefox, which leaves .tmp files with HTML and JS (and a screenshot) in the tmp directory.
import io
3
import logging
4
from contextlib import redirect_stdout
5
from html import escape
6
from os import listdir
7
from os.path import isfile, join, splitext
8

9
from bs4 import BeautifulSoup
10 11 12
from pygments import highlight
from pygments.formatters import HtmlFormatter
from pygments.lexers import HtmlLexer
13

Edvard Rejthar's avatar
Edvard Rejthar committed
14
from ..domains import url2path, url2domain
15
from ..model.dbp import Whitelist  # Status, Export, Encounter,
16 17 18 19 20
# from pygments.lexers import JavascriptLexer
# from pygments.styles import get_style_by_name
# import jsbeautifier
from ..parser.mdmaug_js_lexer import MdmaugJsLexer

Edvard Rejthar's avatar
Edvard Rejthar committed
21
# Module-level logger obtained under the name "mdmaug".
logger = logging.getLogger("mdmaug")
22

23

24 25 26 27 28 29
class TrafficLogParser:
    """Enrich the crawl results with links to nicely formatted source-code listings.

    Each ``.tmp`` file in the crawl's cache directory starts with a header
    line ``"<url> <type>"`` (the type may be missing); the rest of the file
    is the fetched source data.
    """

    def __init__(self, crawl):
        """Go through the ``.tmp`` source files and attach them to the crawl.

        For every ``.tmp`` file in ``crawl.cache_dir`` whose header URL is not
        whitelisted and which has some content after the header, the file path
        is appended to ``crawl[domain].urls[path].sourcefiles``.

        Files with a missing or undecodable header are skipped (the browser
        writing them was probably killed).
        """
        for file in listdir(crawl.cache_dir):
            # Build the path once with join() so the isfile() check and the
            # open() call always refer to the same file, with or without a
            # trailing separator on cache_dir.
            path = join(crawl.cache_dir, file)
            if splitext(file)[1] != ".tmp" or not isfile(path):
                continue

            with open(path, 'r', encoding="utf-8") as f:
                # The first line holds the URL and the type of the data.
                try:
                    header = f.readline().rstrip()
                except UnicodeDecodeError:
                    header = ""
                if not header:
                    # No header in the file; skip it.
                    logger.debug("(%s) no contents fetched", crawl.profile)
                    continue
                # partition() always yields the URL as a string, even when
                # the header carries no type after it.
                url = header.partition(" ")[0]

                # Skip whitelisted domains.
                if Whitelist.matches(url):
                    continue

                o = crawl[url2domain(url)].urls[url2path(url)]
                try:
                    has_content = f.readline() != ""
                except (OSError, UnicodeError):
                    # Unreadable body: report it and go on with other files
                    # instead of dropping into a debugger.
                    logger.exception("(%s) cannot read contents of %s", crawl.profile, path)
                    continue
                if has_content:  # some content has been fetched
                    o.sourcefiles.append(path)

    @staticmethod
    def nicify_file(sourcefile):
        """Return nicified HTML output of a ``.tmp`` file containing source code.

        The result is cached next to the source as ``<sourcefile>.htm``; when
        that file already exists its contents are returned directly.

        :param sourcefile: path to the ``.tmp`` file
        :return: HTML string, or ``"error: no content fetched"`` when the file
            has nothing after its header line (nothing is cached then)
        """
        nice_file = sourcefile + ".htm"
        if isfile(nice_file):
            with open(nice_file, "r", encoding="utf-8") as f:
                return f.read()

        with open(sourcefile, 'r', encoding="utf-8") as f:
            # First row is the URL and the type of stream (since WebExtensions
            # there is no mime type any more); the type may be absent.
            url, _, type_ = f.readline().rstrip().partition(" ")
            contents = f.read()  # the rest of the file contains source data

        if contents == "":
            return "error: no content fetched"

        with io.StringIO() as buf:
            with redirect_stdout(buf):  # capture print() output into buf
                # The URL comes from crawled traffic, so escape it for HTML.
                print(f"<h1>{escape(url)}</h1>")
                if "main_frame" in type_ or "sub_frame" in type_:  # HTML
                    TrafficLogParser.HtmlParse(contents)
                    print("<h2>Integral content:</h2>")
                    print("<br>\n".join(escape(contents).split("\n")))
                elif "script" in type_:  # JavaScript
                    TrafficLogParser.JsParse(contents)
                else:  # output file of an unknown type in plain-text at least
                    print(escape(contents))
            data = buf.getvalue()

        with open(nice_file, "w", encoding="utf-8") as f2:  # store the nice formatting
            f2.write(data)

        return data

    @staticmethod
    def HtmlParse(contents):
        """Print highlighted listings of potentially dangerous parts of an HTML document.

        Prints (to stdout) tags having an attribute starting with ``on``,
        the tags meta/link/frame/iframe/object/embed, the ``src`` of images,
        and finally every ``<script>`` tag followed by its inner code run
        through :meth:`JsParse`. Prints a notice when nothing was printed.

        :param contents: HTML source text
        """
        soup = BeautifulSoup(contents, "html.parser")
        found = 0

        def pygment(text):
            """Print ``text`` highlighted as HTML and count it as a finding."""
            nonlocal found
            try:
                print(highlight(str(text), HtmlLexer(), HtmlFormatter()))
                found += 1
            except TypeError:
                logger.error("mdmaug pygment: QUIT")
                print("I should have never come here; mdmaug - pygment.")
                # Terminate as the original code did, but without relying on
                # the interactive-only quit() helper from the site module.
                raise SystemExit

        for tag in soup.find_all():
            # Print potentially dangerous tags.
            if any(a.startswith('on') for a in tag.attrs):  # dynamic JS content in an attribute
                pygment(tag)
            if tag.name in ["meta", "link", "frame", "iframe", "object", "embed"]:
                # XX link-rel only, meta-redirect only? Are other attributes dangerous too?
                pygment(tag)  # iframe, object, embed -> print the whole tags
            if tag.name == "img":
                # Only the src attribute is of interest; the angle brackets are
                # kept to make clear that it is a tag.
                pygment(f"<img src={tag.get('src')} />")

        for tag in soup.find_all("script"):  # all scripts at the end -> JS parser
            inner, tag.contents = tag.contents, []
            pygment(tag)  # print <script> without its inner content
            # There should be a single subtag (the script body), but invalid
            # extra subtags are processed too so that they do not go unnoticed.
            for subtag in inner:
                TrafficLogParser.JsParse(subtag)

        if not found:
            print("Nothing suspicious detected.")

    @staticmethod
    def JsParse(contents):
        """Print ``contents`` highlighted by ``MdmaugJsLexer`` as HTML to stdout.

        :param contents: JavaScript source (converted with ``str()``)
        """
        # NOTE(review): the original comment listed eval, document.write,
        # window.open, open, window.location, location, document.location and
        # document.cookie here - presumably the constructs of interest; confirm
        # against MdmaugJsLexer.
        print(highlight(str(contents), MdmaugJsLexer(), HtmlFormatter()))

    @staticmethod
    def getStylesheet():
        """Return a ``<style>`` element with the Pygments CSS plus the ``.gr`` rule."""
        style = ".gr {color:white!important;background:red;font-size:150%;}"  # highlighting of suspicious things
        return "<style>" + HtmlFormatter().get_style_defs('.highlight') + style + "</style>"

# from timeit import default_timer as timer
# start = timer()
# TrafficLogParser()
# end = timer()
Edvard Rejthar's avatar
Edvard Rejthar committed
158
# logger.debug(end- start)