Commit 2ef686c4 authored by Petr Špaček

WIP: zone2pickle optimization

parent c56efb1b
@@ -2,7 +2,8 @@
 set -o errexit -o xtrace
 test -f zone || wget -O zone https://www.internic.net/domain/in-addr.arpa
-zone2pickle.py zone in-addr.arpa
+ldns-read-zone -zc -E SOA -E NS -E A -E AAAA zone > zone.normalized
+zone2pickle.py zone.normalized in-addr.arpa
 nsname2ipset.py
 domain2ipset.py
 genednscomp.py > ednscomp.input
......
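For orientation: zone2pickle.py now consumes the normalized file written by the ldns-read-zone step above, one record per line with tab-separated fields (owner, TTL, class, type, rdata), the zone's SOA record on the first line, and only the record types kept by the -E filters. The sample below is illustrative only; the owner names, TTLs and addresses are made up and the real file uses single tabs between fields:

    in-addr.arpa.        3600  IN  SOA   ns.example. hostmaster.example. 1 7200 3600 1209600 3600
    10.in-addr.arpa.     3600  IN  NS    ns.10.in-addr.arpa.
    10.in-addr.arpa.     3600  IN  NS    ns2.example.
    ns.10.in-addr.arpa.  3600  IN  A     192.0.2.53
    ns.10.in-addr.arpa.  3600  IN  AAAA  2001:db8::53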
@@ -2,6 +2,7 @@
 Shared load/save methods for intermediate results
 '''
+import functools
 import logging
 import pickle
 from typing import Dict, Set
@@ -14,9 +15,9 @@ def load_nsname2ipset() -> Dict[dns.name.Name, Set[AnIPAddress]]:
     logging.info('loading NS name -> IP address mapping')
     with open('nsname2ipset.pickle', 'rb') as nsname2ipset_pickle:
         nsname2ipset = pickle.load(nsname2ipset_pickle)
-    ip_all = set.union(*nsname2ipset.values())
+    ip_cnt = functools.reduce(lambda cnt, ipset: cnt + len(ipset), nsname2ipset.values(), 0)
     logging.info('loaded %s unique IP addresses for %s NS names',
-                 len(ip_all), len(nsname2ipset))
+                 ip_cnt, len(nsname2ipset))
     return nsname2ipset
 def load_domain2ipset() -> Dict[dns.name.Name, Set[AnIPAddress]]:
@@ -27,9 +28,8 @@ def load_domain2ipset() -> Dict[dns.name.Name, Set[AnIPAddress]]:
     return domain2ipset
 def save_nsname2ipset(nsname2ipset: Dict[dns.name.Name, Set[AnIPAddress]]) -> None:
     '''TODO: skip ip_cnt computation if not in verbose mode?'''
-    if nsname2ipset:
-        ip_cnt = len(set.union(*nsname2ipset.values()))
+    ip_cnt = functools.reduce(lambda cnt, ipset: cnt + len(ipset), nsname2ipset.values(), 0)
-    else:
-        ip_cnt = 0
     logging.info('pickling NS name -> IP address mapping '
......
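The counting change in this file is not a pure refactoring: the old set.union counted every address once, while the functools.reduce form sums the per-NS-name set sizes, so an address shared by several NS names is counted once per name. Unlike set.union(*...), the reduce form with an initial value of 0 also handles an empty dict, which is why the if/else guard in save_nsname2ipset can go away. A minimal sketch with toy data to show the difference:

    import functools

    # toy data: two NS names sharing one address
    nsname2ipset = {
        'ns1.example.': {'192.0.2.1', '192.0.2.2'},
        'ns2.example.': {'192.0.2.1'},
    }

    unique_ips = len(set.union(*nsname2ipset.values()))  # 2: shared address counted once
    summed = functools.reduce(lambda cnt, ipset: cnt + len(ipset),
                              nsname2ipset.values(), 0)  # 3: counted once per NS name
    assert summed == sum(len(ipset) for ipset in nsname2ipset.values())
    print(unique_ips, summed)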
@@ -36,7 +36,7 @@ Preparation
 to speed up further processing. Do not skip this step; missing canonicalization
 might cause problems down the road::

-    $ ldns-read-zone -E SOA -E NS -E A -E AAAA input_zone > zone.nodnssec
+    $ ldns-read-zone -zc -E SOA -E NS -E A -E AAAA input_zone > zone.nodnssec

 Running scan
......
@@ -4,6 +4,7 @@ Transform DNS zone file into pickled Python objects.
 """
 import argparse
+import collections
 import ipaddress
 import logging
 import pickle
@@ -15,43 +16,76 @@ import dns.zone
 import dataapi
 from ednsevalzone import AnIPAddress
-def domain2nsset(zoneobj: dns.zone.Zone) -> Dict[dns.name.Name, Set[dns.name.Name]]:
+class NameCache(collections.defaultdict):
     '''
-    generator!
-    TODO: optimize, currently it requires whole zone in single object'''
-    return {domain: set(ns.target for ns in node.find_rdataset(dns.rdataclass.IN, dns.rdatatype.NS))
-            for domain, node in zoneobj.items()
-            if node.get_rdataset(dns.rdataclass.IN, dns.rdatatype.NS)}
-def uniq_nslist(nssets: Iterable[Set[dns.name.Name]]) -> Set[dns.name.Name]:
-    '''TODO: optimize, it could be done together with domain2nsset transformation'''
-    uniq_ns = set()  # type: Set[dns.name.Name]
-    for nsset in nssets:
-        uniq_ns.update(nsset)
-    return uniq_ns
-def glue_ns2ipset(nslist: Set[dns.name.Name], zoneobj: dns.zone.Zone) \
-        -> Dict[dns.name.Name, Set[AnIPAddress]]:
-    '''
-    NS names without glue addresses will not show up in output
+    Dict: str -> dns.name.Name
+    dns.name.from_text() is slow and this is used to cache its results
     '''
-    ns2ipset = {}  # type: Dict[dns.name.Name, Set[AnIPAddress]]
-    for nsname in nslist:
-        if nsname in ns2ipset:  # optimization, do not redo it
-            continue
-        ipset = set()  # type: Set[AnIPAddress]
-        try:
-            node = zoneobj[nsname]
-        except KeyError:  # glue name not in zone
-            continue
-        for rdtype in (dns.rdatatype.A, dns.rdatatype.AAAA):
-            rrset = node.get_rdataset(dns.rdataclass.IN, rdtype)
-            if rrset:
-                ipset = ipset.union(set(ipaddress.ip_address(ip.address)
-                                        for ip in rrset))
-        if ipset:  # do not store glueless names
-            ns2ipset[nsname] = ipset
-    return ns2ipset
+    def __missing__(self, name_str):
+        name = dns.name.from_text(name_str)
+        self[name_str] = name
+        return name
+class ZoneExtractor():
+    def __init__(self, owner_str: str):
+        # zone
+        self.origin = dns.name.from_text(owner_str)
+        self.nsnames = set()  # type: Set[dns.name.Name]
+        self.nsname2ipset = {}  # type: Dict[dns.name.Name, Set[AnIPAddress]]
+        self.domain2nsset = {}  # type: Dict[dns.name.Name, Set[dns.name.Name]]
+        # data deduplication: temporary dict for singletons
+        self.nssets = {}  # type: Dict[Set[dns.name.Name], Set[dns.name.Name]]
+        self.names = NameCache()  # type: Dict[str, dns.name.Name]
+        # reader
+        self.buf_owner_str = owner_str
+        self.buf_owner = dns.name.from_text(owner_str)
+        self.buf_rdtype_str = 'SOA'
+        self.buf_rdtexts = []  # type: List[str]
+    def finish_rrset(self):
+        if self.buf_rdtype_str == 'NS':
+            nsset = frozenset(self.names[nsname] for nsname in self.buf_rdtexts)
+            # use singleton frozensets to save memory, i.e. keep only a single copy of equal frozensets
+            nsset = self.nssets.setdefault(nsset, nsset)
+            self.domain2nsset[self.buf_owner] = nsset
+            self.nsnames.update(nsset)
+        elif self.buf_rdtype_str == 'ipaddr':
+            ipset = frozenset(ipaddress.ip_address(ipaddr) for ipaddr in self.buf_rdtexts)
+            self.nsname2ipset[self.buf_owner] = ipset
+        elif self.buf_rdtype_str == 'SOA':
+            pass  # ignore
+        else:
+            raise NotImplementedError('rdtype %s not supported' % self.buf_rdtype_str)
+        self.buf_rdtexts = []
+    def consume_line(self, line_owner_str, line_rdtype, line_rdtext):
+        if line_rdtype == 'A' or line_rdtype == 'AAAA':
+            line_rdtype = 'ipaddr'  # do not treat A/AAAA differently
+        #print(self.buf_owner_str, self.buf_rdtype_str, self.buf_rdtexts)
+        #print(line_owner_str, line_rdtype, line_rdtext)
+        #set_trace()
+        if self.buf_owner_str != line_owner_str:
+            self.finish_rrset()
+            self.buf_owner_str = line_owner_str
+            # cache parsed owner name
+            self.buf_owner = self.names[line_owner_str]
+            # the rdtype can change together with the owner; reset it here too,
+            # otherwise the next RRset would be flushed under the stale rdtype
+            self.buf_rdtype_str = line_rdtype
+        elif self.buf_rdtype_str != line_rdtype:
+            # do not call finish twice!
+            self.finish_rrset()
+            self.buf_rdtype_str = line_rdtype
+        # now buffer compatible data types
+        self.buf_rdtexts.append(line_rdtext)
+def _read_soa(zone_file: TextIO) -> str:
+    line = zone_file.readline()
+    owner, ttl, rdclass, rdtype, rdtext = line.split('\t', 5)
+    assert(rdclass == 'IN')
+    assert(rdtype == 'SOA')
+    return owner
 def convert(zone_file: TextIO, zone_origin: dns.name.Name) -> Tuple[ \
         Dict[dns.name.Name, Set[dns.name.Name]], \
@@ -59,17 +93,24 @@ def convert(zone_file: TextIO, zone_origin: dns.name.Name) -> Tuple[ \
         Dict[dns.name.Name, Set[AnIPAddress]]]:
     '''
     convert text zone into set of pickle files with preprocessed metadata
+    only a zone file canonicalized using this command is acceptable:
+    ldns-read-zone -zc -E SOA -E NS -E A -E AAAA
     '''
-    logging.info('loading zone file')
-    zone = dns.zone.from_file(zone_file, origin=zone_origin, relativize=False)
+    logging.info('loading and processing zone file')
+    #input_size = os.fstat(zone_file.fileno()).st_size
-    logging.info('determining list of unique NS names')
-    nslist = uniq_nslist(domain2nsset(zone).values())
+    # read SOA
+    origin_str = _read_soa(zone_file)
+    zoneext = ZoneExtractor(origin_str)
+    assert(zone_origin == zoneext.origin)
-    logging.info('computing NS name -> IP address mapping from zone glue data')
-    nsname2ipset = glue_ns2ipset(nslist, zone)
+    for line in zone_file:
+        owner_str, ttl, rdclass, rdtype, rdata = line.split('\t', 5)
+        zoneext.consume_line(owner_str, rdtype, rdata.strip())
+    zoneext.finish_rrset()
-    return domain2nsset(zone), nslist, nsname2ipset
+    return zoneext.domain2nsset, zoneext.nsnames, zoneext.nsname2ipset
 def save(domain_nsset, nslist, nsname2ipset):
     '''
......
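A small usage sketch of the new streaming path, assuming the script is importable as a module named zone2pickle; all owner names, NS targets and addresses below are made up. ZoneExtractor buffers consecutive lines that share an owner and record type and flushes them into its result dicts whenever either changes, so one extra finish_rrset() call is needed after the last line, exactly as convert() does:

    from zone2pickle import ZoneExtractor

    ext = ZoneExtractor('in-addr.arpa.')  # owner of the zone's SOA record
    # (owner, rdtype, rdata) triples as convert() extracts them from the normalized file
    for owner, rdtype, rdata in [
            ('10.in-addr.arpa.', 'NS', 'ns.10.in-addr.arpa.'),
            ('10.in-addr.arpa.', 'NS', 'ns2.example.'),
            ('ns.10.in-addr.arpa.', 'A', '192.0.2.53'),
            ('ns.10.in-addr.arpa.', 'AAAA', '2001:db8::53'),
    ]:
        ext.consume_line(owner, rdtype, rdata)
    ext.finish_rrset()  # flush the last buffered RRset

    print(ext.domain2nsset)   # delegated domain -> frozenset of NS names
    print(ext.nsnames)        # every NS name seen in an NS RRset
    print(ext.nsname2ipset)   # NS name -> frozenset of glue addresses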