# nspr_log_parser.py
import logging
import os
import re

import humanize

from ..domains import url2domain

logger = logging.getLogger("mdmaug")


class NsprLogParser:
    """ Enrich the search results (crawl object) with URLs found in the Firefox NSPR log file. """

    def __init__(self, logfile, crawl):
        """Parse the NSPR log file and register every URL found in it on the crawl object.

        :param logfile: path to the Firefox NSPR log file to analyse
        :param crawl: crawl result object; indexing it by a domain is expected to
                      auto-create the entry (defaultdict-like) — TODO confirm with the Crawl class
        """
        # Raw string fixes the invalid escape sequences (e.g. "\/") the original
        # non-raw literal produced — the pattern value itself is unchanged.
        # Groups per match: (full url, "scheme://", scheme, host, path).
        url_pattern = re.compile(
            r'(((http|ftp|https):\/\/)([\w\-_]+(?:(?:\.[\w\-_]+)+))([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?)')

        # Read and scan the whole log file in one pass.
        with open(logfile, 'r') as f:
            urls = url_pattern.findall(f.read())
        logger.debug(f"({crawl.profile}) log size: {humanize.naturalsize(os.path.getsize(logfile))}")
        # XXXsubprocess.call(["rm",Config.LOG_DIR + "log"+str(ScanController.profile )+".txt"]) delete the logfile

        for i in urls:
            domain = i[1] + i[3]  # e.g. "http://" + "example.com"
            path = i[4]

            # crawl[domain] # XX if it is certain that whenever -1 is on a domain another record is there too, this line can be dropped
            if path == ":-1":
                continue  # an internal FF value, present on every site
            if path == "":  # I used to have redundant "/" and "" record for the analysed domain
                path = "/"

            # __getitem__ side effect registers the URL's path under its domain
            # (relies on autovivification in the crawl object — presumably defaultdict-like).
            crawl[url2domain(domain)].urls[path]