metadata_parser.py 8.18 KB
Newer Older
1 2 3 4
import datetime
import logging
import threading
from lib.config import Config
5 6 7 8
from lib.model.dbp import Export
from lib.model.dbp import Status
from lib.model.dbp import Turris
from lib.model.dbp import Whitelist
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37
from lib.domains import Domains

class MetadataParser:
    """Enrich the search results (the ``crawl`` object) with whois information and data from the db.

    Constructing an instance does all the work: every domain found in
    ``crawl`` is processed in its own thread, each domain in turn spawns one
    thread per resolved IP address, and the constructor returns only after
    all of them have finished.

    NOTE: the per-domain "vote" enrichment that used to live here is
    currently disabled.
    """

    def __init__(self, crawl, websiteDomain):
        """Process every domain in ``crawl`` and block until all are done.

        Args:
            crawl: mapping whose keys are the encountered domains; each value
                is handed to ``addDomain`` and must expose ``addresses`` and
                ``pdns``.
            websiteDomain: domain of the analysed website itself; it is
                skipped during processing and stored as ``url`` of newly
                inserted ``Turris`` rows.
        """
        self.websiteDomain = websiteDomain

        # Every domain gets its own thread because loading its geoIP takes a while.
        pending = list(crawl.keys())
        domainThreadNumber = 0
        while pending:
            # Run the domain threads in bounded batches - an unbounded run
            # once overflowed (each domain additionally starts one thread per
            # IP, but there are usually only a few of those).
            # A limit below 1 would never start anything and this loop would
            # spin forever, so at least one thread per batch is enforced.
            # NOTE(review): assumes MAX_WHOIS_DOMAIN_THREADS is an int - confirm.
            batchLimit = max(1, Config.MAX_WHOIS_DOMAIN_THREADS)
            threads = []
            while pending and len(threads) < batchLimit:
                domain = pending.pop()
                domainThreadNumber += 1
                t = threading.Thread(target=self.addDomain, args=(crawl[domain], domain, domainThreadNumber))
                threads.append(t)
                t.start()
            # Wait for the whole batch before starting the next one.
            for thread in threads:
                thread.join()

    def addDomain(self, crawlDomain, domainEncountered, domainThreadNumber):
        """Gather information about a single encountered domain.

        The website's own domain, ``127.0.0.1``, ``localhost`` and
        whitelisted domains are skipped. For any other domain one thread per
        resolved IP runs ``addAddress``; when the host resolves to no IP at
        all, ``addAddress`` is called once with ``ip=None``.

        Args:
            crawlDomain: the ``crawl`` entry for this domain; its
                ``addresses[ip]`` and ``pdns`` are passed on to ``addAddress``.
            domainEncountered: the domain as it was encountered, e.g.
                ``http://seznam.cz`` or ``//ajax.googleapis.com``.
            domainThreadNumber: sequence number of the domain thread, only
                forwarded to ``addAddress``.
        """
        domainNaked = Domains.url2domain(domainEncountered)  # http://seznam.cz -> seznam.cz
        logging.debug("domena: " + domainNaked)

        # The website ignores itself - and localhost must never end up blocked.
        if domainNaked in (self.websiteDomain, "127.0.0.1", "localhost"):
            logging.debug("skip itself")
            return

        # Is the domain on the whitelist of second-level domains?
        if Whitelist.matches(domainEncountered):
            logging.debug("skip whitelisted")
            return

        # The domain is not whitelisted: load its IPs, one thread per address.
        threads = []
        # Each frame looks like (10, 1, 6, '', ('2001:888:2000:d::a2', 80, 0, 0)).
        for threadNumber, ip_frame in enumerate(Domains.get_ips_for_host(domainNaked), start=1):
            ip = ip_frame[4][0]
            t = threading.Thread(
                target=self.addAddress,
                args=(crawlDomain.addresses[ip], ip, domainEncountered, crawlDomain.pdns, threadNumber, domainThreadNumber),
            )
            threads.append(t)
            t.start()

        # Wait until the information about all IPs of the domain is gathered.
        for thread in threads:
            thread.join()

        if not threads:
            # The domain has no IP (it is suspended); we still want to record
            # the evil host - at least with a null IP.
            self.addAddress(None, None, domainEncountered, crawlDomain.pdns, 0, 0)

    def addAddress(self, crawlDomainIp, ip, remoteHost, pdns, threadNumber, domainThreadNumber):
        """Record one (host, IP) pair in the db and load country, city and PDNS data for the IP.

        Meant to run in a thread per IP, since each external lookup takes
        about a second.

        Args:
            crawlDomainIp: object receiving ``country`` and ``city`` (an
                Address object according to the original author's note);
                ``None`` when ``ip`` is ``None``.
            ip: the IP address (the key under which the Address object is
                stored in the Domain object), or ``None`` when the host has
                no IP.
            remoteHost: the host as it was encountered.
            pdns: collection updated in place through ``pdns.update(items)``
                (presumably a set or dict shared per domain - confirm).
            threadNumber: sequence number of the IP thread (debugging aid, unused).
            domainThreadNumber: sequence number of the domain thread
                (debugging aid, unused).

        Returns:
            Always ``None``.
        """
        # All database work is serialised through the shared lock.
        with Config.lock:
            self._recordEncounter(ip, remoteHost)

        if ip is None:
            # No IP available, the domain has probably expired.
            return None

        # Ask the external geoIP service.
        crawlDomainIp.country, crawlDomainIp.city = Domains.ip2countryAndCity(ip)
        # Ask PDNS.
        items = Domains.ip2pdnsDomains(ip)
        if items:
            pdns.update(items)
        return None

    def _recordEncounter(self, ip, remoteHost):
        """Refresh the encounter timestamp of a known host/IP or insert a new ``Turris`` row.

        The caller is expected to hold ``Config.lock``. The evaluation order
        of the queries below matters because the UPDATE statements are side
        effects of the condition itself: the IP update only runs when the
        host is already known.
        """
        # Update the encounter timestamp so the host shows up in the next export.
        # The host has no row under its name yet (it may have one under the IP).
        # Author's note: for an unknown reason this UPDATE sometimes reports
        # zero rows although rows exist (it reported a correct count under
        # pdb), hence the extra SELECT as a double check. The IP update
        # below reports a correct number.
        hostUnknown = (
            Turris.update(timestamp=datetime.datetime.now()).where(Turris.remoteHost == remoteHost).execute() == 0
            and Turris.select().where(Turris.remoteHost == remoteHost).count() == 0
        )
        # The second test: no row has this IP (there may be a row with the
        # same name and a different IP).
        if hostUnknown or Turris.update(timestamp=datetime.datetime.now()).where(Turris.ip == ip).execute() == 0:
            # We have not met this address yet, it is not in the turris table - add it.
            Turris.insert(ip=ip, port=80, url=self.websiteDomain, remoteHost=remoteHost).execute()  # XXX the port may differ from 80




##try:
##if ip == None: #domena nema ip, v databazi je tedy 1x, vyhledavame dle nazvu domeny
#Db.cur.execute("""SELECT status from turris JOIN status ON status.id = turris.status WHERE `evil host` = %s LIMIT 1""", (remoteHost,))
##status = Turris.select().join(Status, on=(Status.id == Turris.status)).where(Turris.remoteHost == remoteHost).limit(1).get().status
##else: #domena ma ip, kazdy je v db zvlast, vyhledavame dle ip
#Db.cur.execute("""SELECT status from turris JOIN status ON status.id = turris.status WHERE `ip` = %s LIMIT 1""", (ip,))
##status = Turris.select().join(Status, on=(Status.id == Turris.status)).where(Turris.ip == ip).limit(1).get().status
#res = Db.cur.fetchone()
##vote = Status.int2word(status)

##except Exception: # genericka excepce je spatna, ale prece se z tech importu neze**ru X TurrisDoesNotExist
##    pass
##logging.debug("vote " + str(vote))
#pass
#else: