# crawl.py
from collections import defaultdict
from copy import deepcopy

from yaml import load, dump
try:
    from yaml import CLoader as Loader, CDumper as Dumper
except ImportError:
    from yaml import Loader, Dumper

from lib.config import Config

class Crawl(defaultdict):
    """Aggregate the results of a crawl, keyed by visited domain.

    Missing keys are created on first access as ``_Domain`` instances.
    """

    def __init__(self, host=None, state=None, logDir=None, cacheDir=None):
        """Create an empty crawl, optionally restoring a saved state.

        Args:
            host: stored as ``self.host`` when truthy.
            state: output of ``__getstate__()`` (e.g. loaded from YAML);
                restored through ``__setstate__()`` when truthy.
            logDir: stored as ``self.logDir`` when truthy.
            cacheDir: stored as ``self.cacheDir`` when truthy.
        """
        super().__init__(_Domain)
        self.screenfile = None  # HTML output XXX

        if host:
            self.host = host
        if logDir:
            self.logDir = logDir
        if cacheDir:
            self.cacheDir = cacheDir
        if state:
            # Restoring replaces __dict__ wholesale, so the attributes set
            # above are overridden by whatever the state carries.
            self.__setstate__(state)

    def __str__(self):
        """Return a summary header followed by one line per visited domain."""
        lines = ["Výsledky vyhledávání - navštíveno {} domén".format(len(self))]
        lines.extend(
            "* " + key + " " + str(domain) for key, domain in self.items()
        )
        return "\n".join(lines)

    def saveToFile(self, filename):
        """Serialize the crawl state as YAML into ``filename``."""
        with open(filename, "w", encoding="utf-8") as f:
            f.write(dump(self.__getstate__(), Dumper=Dumper))

    @staticmethod
    def loadFromFile(filename):
        """Return a new ``Crawl`` restored from the YAML file ``filename``.

        Declared static so it works both as ``Crawl.loadFromFile(path)`` and
        when called on an instance.
        """
        with open(filename, "r", encoding="utf-8") as f:
            # SECURITY: ``Loader`` is PyYAML's full loader and can construct
            # arbitrary Python objects. Only load files this program wrote
            # itself; never point this at untrusted input.
            return Crawl(state=load(f.read(), Loader=Loader))

    def __getstate__(self):
        """Return the instance attributes plus a ``"keys"`` list.

        ``"keys"`` holds one ``[domain_name, domain_state]`` pair per visited
        domain, where ``domain_state`` is that domain's ``__getstate__()``.
        """
        state = self.__dict__.copy()
        state["keys"] = [
            [key, domain.__getstate__()] for key, domain in self.items()
        ]
        return state

    def __setstate__(self, state):
        """Restore domains and attributes from a ``__getstate__()`` result.

        The caller's ``state`` mapping is left untouched: a shallow copy is
        taken before the ``"keys"`` entry is split off, and that copy (not the
        caller's dict) becomes the new ``__dict__``.
        """
        attrs = dict(state)
        for key, domain_state in attrs.pop("keys", ()):
            self[key].__setstate__(domain_state)
        self.__dict__ = attrs

class _Domain(defaultdict):
    """A domain visited during the crawl.

    Attributes:
        urls: ``defaultdict`` creating a ``_Url`` per key on first access.
        addresses: ``defaultdict`` creating an ``_Address`` per key on first
            access.
        pdns: set of PDNS entries (concatenated as strings by ``__str__``).
    """

    def __init__(self):
        super().__init__()
        self.urls = defaultdict(_Url)
        self.addresses = defaultdict(_Address)
        self.pdns = set()

    def __str__(self):
        """Return counts, one line per url/address, then the PDNS entries."""
        parts = [
            "{} adres a {} url".format(len(self.addresses), len(self.urls))
        ]
        parts.extend(
            "\n " + key + " " + str(url) for key, url in self.urls.items()
        )
        parts.extend(
            "\n " + key + " " + str(address)
            for key, address in self.addresses.items()
        )
        if self.pdns:
            parts.append("Informace z PDNS:\n")
            parts.extend(key + " " for key in self.pdns)
        else:
            parts.append("\n Žádné informace z PDNS.")
        return "".join(parts)

    def __getstate__(self):
        """Return a detached snapshot of this domain.

        ``"addresses"`` and ``"urls"`` become lists of ``[key, attributes]``
        pairs; every other instance attribute is carried over as is. The
        result is deep-copied so that changing the snapshot (or an object
        restored from it) never changes this domain.
        """
        state = self.__dict__.copy()
        state["addresses"] = [
            [key, address.__dict__] for key, address in self.addresses.items()
        ]
        state["urls"] = [[key, url.__dict__] for key, url in self.urls.items()]
        return deepcopy(state)

    def __setstate__(self, state):
        """Restore this domain from a ``__getstate__()`` result.

        Works on a private deep copy, so the caller's ``state`` is neither
        modified nor shared with the restored objects.
        """
        state = deepcopy(state)
        for key, attrs in state.pop("addresses", ()):
            self.addresses[key].__dict__ = attrs
        for key, attrs in state.pop("urls", ()):
            self.urls[key].__dict__ = attrs
        # Whatever is left (e.g. "pdns") maps straight onto attributes.
        self.__dict__.update(state)
      

class _Url(set):
    """A unique URL visited within a domain.

    Attributes:
        spyfile: list of paths to files holding suspicious code run by the
            page.
        sourcefile: list of paths to files holding the page source. There may
            be several, because e.g. http://seznam.cz/index.htm and
            https://seznam.cz/index.htm both belong to the domain seznam.cz.
    """

    def __str__(self):
        """Return the string form of the instance attribute dict."""
        return str(self.__dict__)

    def __init__(self):
        super().__init__()
        self.spyfile = []
        self.sourcefile = []

    def addSourcefile(self, sourcefile):
        """Append ``sourcefile`` to the list of source-code file paths."""
        # A guard raising RuntimeError on a second source file used to be
        # sketched here; it is disabled, so several files are accepted.
        self.sourcefile.append(sourcefile)

    def addSpyfile(self, spyfile):
        """Append ``spyfile`` to the list of suspicious-code file paths."""
        # As above: the single-file guard is disabled, several are accepted.
        self.spyfile.append(spyfile)

class _Address(set):
    """An address of a visited domain, with its attached geolocation.

    Attributes:
        country: geolocation country, ``None`` until assigned.
        city: geolocation city, ``None`` until assigned.
    """
    # NOTE: custom __getstate__/__setstate__ hooks and a ``vote`` attribute
    # were sketched for this class but are disabled.

    def __init__(self):
        # Chained assignment binds left to right: country first, then city.
        self.country = self.city = None

    def __str__(self):
        """Return a one-line description of the attached geolocation."""
        template = "K adrese je připojena geolokace {} {}"
        return template.format(self.country, self.city)

"""
Example (and YAML-serialization check); needs ``import logging``:

c = Crawl()
c["seznam.cz"].urls["/aurl"].addSpyfile("/soubor-spyfil")
c["seznam.cz"].urls["/aurl"].addSourcefile("/1.source")  # sourcefile is a list, so files are appended
c["seznam.cz"].urls["/aurl"].addSourcefile("/2.source")
c["seznam.cz"].addresses["8.8.8.8"]
c["seznam.cz"].addresses["9.5.2.1"]
c["seznam.cz"].addresses["8.8.8.8"].country = "preague"
#c["centrum.cz"].addresses["8.8.8.8"].vote = "yes"
#c.__getstate__()
e = Crawl()
e.__setstate__( c.__getstate__() )
#e = dill.loads(dill.dumps(c))
logging.debug(str(c) == str(e))

logging.debug(c)
output = dump(c.__getstate__(), Dumper=Dumper)
e = Crawl()
e.__setstate__(load(output, Loader=Loader))
logging.debug(e)
logging.debug(str(c) == str(e))
"""