import datetime
import json
import logging
import os
import shutil
import subprocess
import time
import traceback
from glob import glob
from itertools import combinations_with_replacement
from json import JSONDecodeError
from random import randint

from filelock import FileLock
from flask import escape

from ..config import Config
from ..domains import domain2dir, assure_url, url2domain
from ..model.crawl import Crawl
from ..parser.metadata_parser import MetadataParser
from ..parser.nspr_log_parser import NsprLogParser
from ..parser.screenshot_parser import ScreenshotParser
from ..parser.spy_parser import SpyParser
from ..parser.traffic_log_parser import TrafficLogParser

logger = logging.getLogger("mdmaug")


class ScanController:
    FF_INFO_FILE = "cache.dir"

    profile = "-1"  # booked browser profile

    url = None

    def __init__(self):
        self.lock = None

    @staticmethod
    def get_scan(domain, scan: str = ""):
        """
        :param domain: hostname # XX the argument is not domain, but domain2dir(domain) (or something)
        :param scan: time of scan; if not specified, the most recent one is returned
        :return: Crawl object
        """
        scans = ScanController.get_domain_scans(domain)
        if not scans:
            return False
        else:
            if scan:
                if scan not in scans:
                    return "Scan wasn't performed at this time."
            else:
                scan = scans[0]
            return Crawl.load_from_scan(domain, scan)
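
    # A minimal usage sketch (hypothetical domain; per the XX note above, get_scan
    # expects the directory name produced by domain2dir):
    #   crawl = ScanController.get_scan(domain2dir("http://example.com"))  # most recent scan
    #   crawl = ScanController.get_scan(domain2dir("http://example.com"), "150507151215")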

    @staticmethod
    def get_domain_scans(domain):
        """
            # XX listdir should be ordered from the most recent (or the highest in the alphabet)
            # XX the argument is not domain, but domain2dir(domain) (or something)
        """

        d = Config.CACHE_DIR + domain + "/"
        if os.path.isdir(d):
            # all possible snapshot directories
            return [scan for scan in os.listdir(d) if os.path.isdir(d + scan)]
        else:
            return []
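
    # E.g. a domain dir holding the snapshots 150507151215 and 160101120000 yields
    # ["150507151215", "160101120000"] (in arbitrary os.listdir order, cf. the XX note).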

    def launch(self, url, cached=None, autoprune=False, creation_spree=False):
        """
73 74
        :param creation_spree: if true and using cache, we'll skip an existing analysis. (Good when batch-analysing a large list.)
        :type autoprune: bool, if true and using cache, we'd delete an old analysis that is not complete rather than returning it.
Edvard Rejthar's avatar
Edvard Rejthar committed
75
        :param url: scanned url
Edvard Rejthar's avatar
Edvard Rejthar committed
76 77
        :type cached: True = Any cached version, int = cached version X days old. If None or not found, site will be reanalysed
        """
        checked = assure_url(url)
        if not checked:
            return f"Invalid URL {escape(url)}"
        self.url = url = checked

        if cached:
            domain = domain2dir(url)
            prep = Config.CACHE_DIR + domain + "/"
            # oldest first; we pop() from the end, so the most recent scan comes first
            scans = sorted(self.get_domain_scans(domain), key=lambda s: os.path.getmtime(prep + s))
            while scans:
                # take the most recent snapshot dir and check that it is not too old
                scan = scans.pop()
                if autoprune and self.clean_scan(domain, scan):
                    continue
                if cached is True or os.path.getmtime(prep + scan) > time.time() - (3600 * 24 * cached):
                    if creation_spree:
                        return f"Scan for {domain} already exists."
                    try:
                        logger.debug(f"Returning a previous crawl from: {domain}/{scan}")
                        crawl = Crawl.load_from_scan(domain, scan)
                        return crawl
                    except ValueError:
                        break
            logger.debug(f"(-1) Suitable cached analysis not found for {url}")

        # perform fresh analysis
        self.lock = FileLock(Config.config_file + ".lock")
        if self.queue(url):  # book a free browser profile for this analysis
            logger.debug(f"({self.profile}) start crawl")
            # noinspection PyBroadException
            try:
                crawl = self.analyze()
            except Exception as e:
                logger.debug(f"({self.profile}) PROFILE EXCEPTION")
                logger.debug(traceback.format_exc())
                # (if the problem is FF is killed by JS, you may experiment with ulimit -Sv 500000)
                return f"PROFILE EXCEPTION ({self.profile}) {e} See syslog."

            crawl.save_to_file()  # save search results
            return crawl

        else:
            logger.debug("(-) no free slots")
            result = f"Scanning {self.url} failed – no free slots. <a href='{Config.APP_HOST}/reset'>Reset</a>"
        return f"<div id='analysis-results'>{result}</div>"

    def analyze(self):
        """ Run Firefox under a profile. """
        logger.debug(f"({self.profile}) browser launch")

        log_dir, cache_dir = self.assure_dirs()  # prepare log & cache directories

        logfile = f"{log_dir}log{self.profile}.log"

        # append ,nsSocketTransport:5,nsStreamPump:5,nsHostResolver:5 to NSPR_LOG_MODULES for more verbose logging
        logger.debug(f"({self.profile}) FF -P {self.profile} -no-remote {self.url}")
        # http://localhost/redirect/ gets stripped by the extension
        command = f"NSPR_LOG_MODULES=timestamp,nsHttp:5 NSPR_LOG_FILE={logfile} CACHE_DIR={cache_dir} PROFILE={self.profile}" \
                  f" {Config.browser} -P {self.profile} -no-remote 'http://localhost/redirect/{self.url}'"

        # terminate the browser if it fails to exit on its own
        # (everything has to be in a single command because $! is not inherited amongst subprocesses)
        command += f" & echo $!;ii=0; while [ -n \"`ps -p $! | grep {Config.browser}`\" ];" \
                   f" do echo \"({self.profile}) running\" ;ii=$((ii+1)); if [ $ii -gt {Config.MAX_BROWSER_RUN_TIME} ];" \
                   f" then echo '({self.profile}) kill';kill $!; break;fi; sleep 1; done"  # > /dev/null
        logger.debug(command)
        subprocess.call([command], shell=True)
        logger.debug(f"({self.profile}) stopped!")
        # gather analysis information

        crawl = Crawl(host=self.url, log_dir=log_dir, cache_dir=cache_dir, profile=self.profile)

        expiration = 0
        while not os.path.isfile(logfile):  # the log file may appear some time after FF closes
            expiration += 1
            logger.debug(f"({self.profile}) waiting to close...")
            if expiration > Config.MAX_BROWSER_EXPIRATION:
                logger.debug(f"({self.profile}) time ran out!")
                raise FileNotFoundError("time ran out - browser expired")
            time.sleep(1)

        NsprLogParser(logfile, crawl)
        self.dequeue()
        TrafficLogParser(crawl)
        SpyParser(crawl)
        MetadataParser(crawl, url2domain(self.url))
        ScreenshotParser(crawl)
        logger.debug(f"({self.profile}) thread parsers ended")
        return crawl

    @staticmethod
    def _get_cache_dir_stamp():
        return datetime.datetime.now().strftime('%y%m%d%H%M%S')
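
    # e.g. a scan taken on 2015-05-07 15:12:15 is stamped "150507151215"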

    @staticmethod
    def _assure_dir(dir_name):
        """ Dir is created or cleaned up to be ready for use. """
        if not os.path.exists(dir_name):
            os.makedirs(dir_name)
        for file in glob(dir_name + "*"):  # clean the dir: remove leftovers from the previous analysis
            os.remove(file)
        return dir_name

    def assure_dirs(self):
        """ Creates the log and cache directories if they do not exist.
            Cache ex: /home/mdmaug/.cache/mdmaug-scans/example.com/150507151215/ - the XPI logs get generated here
        """
        log_dir = ScanController._assure_dir(Config.LOG_DIR + str(self.profile) + "-log/")
        cache_dir = ScanController._assure_dir(
            Config.CACHE_DIR + domain2dir(self.url) + "/" + ScanController._get_cache_dir_stamp() + "/")

        # info for FF
        with open(log_dir + ScanController.FF_INFO_FILE, "w") as f:  # put a pointer to the cache dir into the log dir
            f.write(cache_dir)  # a hint where FF finds the cache dir (FF locates the log dir by the profile name)

        return log_dir, cache_dir

    @staticmethod
    def _load_profile_queue():
        # load the queue from the config file
        try:
            with open(Config.config_file, 'r') as f:
                queue = json.load(f)
        except (IOError, JSONDecodeError):
            with open(Config.config_file, 'w'):
                pass
            queue = {}
        return queue
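
    # The persisted queue maps booked profile numbers to the URLs they work on,
    # e.g. {"0": "http://example.com", "3": "http://example.org"} - keys are strings,
    # since JSON object keys always are.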

    @staticmethod
    def _save_profile_queue(queue):
        with open(Config.config_file, 'w') as f:
            json.dump(queue, f)

    def dequeue(self):
        with self.lock:
            queue = self._load_profile_queue()
            try:
                queue.pop(str(self.profile))
                self._save_profile_queue(queue)
            except KeyError:
                logger.debug("Unbook failed")
                logger.debug(queue)
                raise
            except OSError:
                logger.debug(f"({self.profile}) OS Error - interference with a running browser consuming too much memory. "
                             f"Let's wait 10 s.")
                time.sleep(10)
                try:
                    self._save_profile_queue(queue)
                except OSError:
                    logger.debug(f"({self.profile}) System didn't recover.")
                    return "Memory may be exhausted. See mdmaug-server/scan_controller.py for details."
                    # FF used up all the memory. URL is problematic. In my opinion, UrlQuery would fail too.

    def queue(self, url):
        """ Reads from queue.cache what profile is available and books it
        :return: Have we succeeded to book a browser profile?
        """
        self.profile = -1
        for _ in range(4):  # wait for a free slot several times
            with self.lock:
                queue = self._load_profile_queue()
                for i in range(Config.profile_count):
                    if queue.get(str(i)) is None:
                        self.profile = i
                        queue[str(self.profile)] = url
                        self._save_profile_queue(queue)
                        return True  # we found a free slot, let's proceed
            if self.profile == -1:
                logger.debug("(-1) FULL, let's wait a few secs")
                time.sleep(randint(5, 10))
        else:
            return False
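
    # The booking cycle in short: queue() books a free profile slot under the file lock,
    # the analysis runs in that browser profile, and dequeue() releases the slot again.
    # A sketch (the lock is normally set up by launch()):
    #   sc = ScanController()
    #   sc.lock = FileLock(Config.config_file + ".lock")
    #   if sc.queue("http://example.com"):
    #       try:
    #           ...  # drive the browser under profile sc.profile
    #       finally:
    #           sc.dequeue()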

    @classmethod
    def prune(cls):
        scan_count = 0
        domains = set()
        for domain, scan in cls.get_all_scans():
            if cls.clean_scan(domain, scan):
                scan_count += 1
                domains.add(domain)

        return f"Pruned {scan_count} scans at {len(domains)} domains."

    @staticmethod
    def get_all_scans():
        """ Generates all domains/scans.
            Notice: temporarily changes CWD.
            (I don't think tha
            t might be a problem and it's easy to implement.)
            :rtype: (str, str)
        """
        cwd = os.getcwd()
        os.chdir(Config.CACHE_DIR)
        for domain, scans, _ in walklevel(".", 1):
            if domain in [".", "./_tmp"]:  # ignore root dir and browser _tmp dir
                continue
            for scan in scans:
                yield domain, scan
        os.chdir(cwd)

    @classmethod
    def clean_scan(cls, domain, scan: str):
        """ If analysis.json is missing or there is only 1 file in the directory, deletes files of a scan.
288 289 290
            If that was the only scan, domain directory is deleted as well. (DB stays intact.)

            Because if a scan fails, there is only analysis.json or nothing in the dir.
291
            Return True if scan was deleted, False if nothing was deleted.
292 293
            """
        scan_path = os.path.join(Config.CACHE_DIR, domain, scan)
        if not os.path.isfile(os.path.join(scan_path, Config.CRAWL_FILE)) or len(os.listdir(scan_path)) <= 1:
            shutil.rmtree(scan_path)
            domain_path = os.path.join(Config.CACHE_DIR, domain)
            if len(os.listdir(domain_path)) == 0:
                shutil.rmtree(domain_path)
            return True
        return False
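
    # E.g. clean_scan("example.com", "150507151215") removes
    # Config.CACHE_DIR/example.com/150507151215/ if the scan there never completed
    # (hypothetical names).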


def walklevel(some_dir, level=1):
    """ Like os.walk, but descends at most `level` levels below some_dir. """
    some_dir = some_dir.rstrip(os.path.sep)
    assert os.path.isdir(some_dir)
    num_sep = some_dir.count(os.path.sep)
    for root, dirs, files in os.walk(some_dir):
        yield root, dirs, files
        num_sep_this = root.count(os.path.sep)
        if num_sep + level <= num_sep_this:
            del dirs[:]
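

# A minimal usage sketch: list the scan dirs one level below the cache root,
# the same way get_all_scans() consumes walklevel():
#   for root, dirs, files in walklevel(Config.CACHE_DIR, level=1):
#       print(root, dirs)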