# Processes the log from Firefox, which leaves .tmp files with HTML, JS and a screenshot in tmp.
import io
import logging
from contextlib import redirect_stdout
from html import escape
from os import listdir
from os.path import isfile, join, splitext

from bs4 import BeautifulSoup
from pygments import highlight
from pygments.formatters import HtmlFormatter
from pygments.lexers import HtmlLexer

from ..domains import url2path, url2domain
from ..model.dbp import Whitelist  # Status, Export, Encounter,
# from pygments.lexers import JavascriptLexer
# from pygments.styles import get_style_by_name
# import jsbeautifier
from ..parser.mdmaug_js_lexer import MdmaugJsLexer

logger = logging.getLogger("mdmaug")


class TrafficLogParser:
    """ Enrich the search results (crawl object) with links to nicely formatted
    listings of the fetched source codes. """

    def __init__(self, crawl):
        """ Walk the source-code files in the crawl cache directory and attach them to the crawl.

        Each ``*.tmp`` file begins with a header line ``"URL [mime-type]"`` followed by
        the fetched contents (dumps left behind by the Firefox run).

        :param crawl: crawl object providing ``cache_dir``, ``profile`` and
            domain/url indexing (``crawl[domain].urls[path].sourcefiles``)
        """
        # log files
        for file in [f for f in listdir(crawl.cache_dir) if isfile(join(crawl.cache_dir, f)) and splitext(f)[1] == ".tmp"]:
            # if file in ('screenshot_base64.txt', 'screenshot_debug.html'): continue
            # fix: use join() consistently — the existence check above already uses it,
            # plain concatenation breaks when cache_dir lacks a trailing separator
            path = join(crawl.cache_dir, file)
            with open(path, 'r') as f:
                mime = ""
                try:
                    url = f.readline().rstrip().split(" ", 1)  # first line holds the URL and the mime-type of the data
                    if len(url) == 1 and url[0] == "":
                        raise ValueError
                    if len(url) == 2:
                        url, mime = url
                    else:
                        # fix: header contained the URL only — unwrap it from the
                        # single-element list so url is always a string below
                        url = url[0]
                except ValueError:  # header is missing; the browser writing the file was probably killed — skip
                    logger.debug(f"({crawl.profile}) no contents fetched")
                    continue

                # skip whitelisted domains
                if Whitelist.matches(url):
                    continue

                o = crawl[url2domain(url)].urls[url2path(url)]
                if f.readline() != "":  # some content has been fetched
                    o.sourcefiles.append(path)

    @staticmethod
    def nicify_file(sourcefile):
        """ Returns nicified output of a .tmp file containing the source codes.

        The result is cached next to the source as ``<sourcefile>.htm`` and reused
        on subsequent calls.
        """
        nice_file = sourcefile + ".htm"
        if isfile(nice_file):
            # a cached nicified version already exists — reuse it
            with open(nice_file, "r") as f:
                return f.read()
        else:
            with open(sourcefile, 'r') as f:
                type_ = ""
                # first row is URL and type of stream (since WebExtensions, no more mime type)
                url = f.readline().rstrip().split(" ", 1)
                if len(url) == 2:
                    url, type_ = url
                else:
                    url = url[0]  # fix: unwrap the URL-only header from its list

                contents = f.read()  # the rest of the file contains source data

                if contents == "":
                    return "error: no content fetched"

                buf = io.StringIO()
                with redirect_stdout(buf):  # capture all print() output into buf
                    print(f"<h1>{url}</h1>")
                    if "main_frame" in type_ or "sub_frame" in type_:  # X"html"
                        TrafficLogParser.HtmlParse(contents)
                        print("<h2>Integral content:</h2>")
                        print("<br>\n".join(escape(contents).split("\n")))
                    elif "script" in type_:  # X"javascript"
                        TrafficLogParser.JsParse(contents)
                    else:  # output file of an unknown type in plain-text at least
                        print(escape(contents))

                data = buf.getvalue()

                with open(nice_file, "w") as f2:  # cache the nice formatting for next time
                    f2.write(data)
                buf.close()

                return data

    @staticmethod
    def HtmlParse(contents):
        """ Print pygmentized highlights of potentially dangerous tags found in the HTML;
        script bodies are handed over to :meth:`JsParse`. """
        soup = BeautifulSoup(contents, "html.parser")
        found = 0

        def pygment(text):
            # highlight a tag/snippet and count it as a finding
            nonlocal found
            try:
                print(highlight(str(text), HtmlLexer(), HtmlFormatter()))
                found += 1
            except TypeError:
                # with open("/tmp/mdm/pomoc","w") as f: f.write(text)
                # logger.warning("{}".format(text))
                logger.error("mdmaug pygment: QUIT")
                print("I should have never come here; mdmaug - pygment.")
                # NOTE(review): quit() kills the whole process; broken JS may have broken
                # pygments here — consider raising instead of exiting
                quit()

        for tag in soup.find_all():
            # print out potentially dangerous tags
            if any(a.startswith('on') for a in tag.attrs):  # has dynamic JS content in an attribute
                pygment(tag)
            if tag.name in ["meta", "link", "frame", "iframe", "object", "embed"]:
                # XX link-rel only, meta-redirect only? Are there other dangerous attributes?
                pygment(tag)  # iframe, object, embed -> print the whole tags
            if tag.name == "img":
                # only the src attribute matters, but keep the angle brackets to make clear it is a tag
                pygment(f"<img src={tag.get('src')} />")

        for tag in soup.find_all("script"):  # finally all scripts -> JS parser
            inner, tag.contents = tag.contents, []
            pygment(tag)  # print <script> without its body
            if inner:  # the script had a body — parse it as JS
                for subtag in inner:
                    # normally a single text node, but loop so any invalid subtags surface too
                    TrafficLogParser.JsParse(subtag)
        if not found:
            print("Nothing suspicious detected.")

    @staticmethod
    def JsParse(contents):
        """ Print a pygmentized listing of the JS contents, highlighting suspicious
        constructs via the custom lexer. """
        # eval,document.write, window.open, open, window.location, location, document.location , document.cookie
        # return
        # logger.debug(contents)
        # contents = """<script>var a = 1;</script>"""
        # logger.debug("<xmp>",jsbeautifier.beautify(str(contents)),"</xmp>")
        # return
        print(highlight(str(contents), MdmaugJsLexer(), HtmlFormatter()))

    # def getOutput(self):return self.output

    @staticmethod
    def getStylesheet():
        """ Return the <style> element needed by the pygmentized listings. """
        style = ".gr {color:white!important;background:red;font-size:150%;}"  # highlighting of suspicious things
        return "<style>" + HtmlFormatter().get_style_defs('.highlight') + style + "</style>"

# from timeit import default_timer as timer
# start = timer()
# TrafficLogParser()
# end = timer()
# logger.debug(end - start)