Commit 23af31a3 authored by Petr Špaček's avatar Petr Špaček

WIP: zone2pickle optimization

parent c56efb1b
Pipeline #48069 passed with stage
in 1 minute and 4 seconds
......@@ -2,7 +2,8 @@
set -o errexit -o xtrace
test -f zone || wget -O zone zone
ldns-read-zone -zc -E SOA -E NS -E A -E AAAA zone > zone.normalized zone.normalized > ednscomp.input
......@@ -2,6 +2,7 @@
Shared load/save methods for intermediate results
import functools
import logging
import pickle
from typing import Dict, Set
......@@ -27,9 +28,8 @@ def load_domain2ipset() -> Dict[, Set[AnIPAddress]]:
return domain2ipset
def save_nsname2ipset(nsname2ipset: Dict[, Set[AnIPAddress]]) -> None:
'''TODO: skip ip_cnt computation if not in verbose mode?'''
if nsname2ipset:
ip_cnt = len(set.union(*nsname2ipset.values()))
ip_cnt = functools.reduce(lambda cnt, ipset: cnt + len(ipset), nsname2ipset.values(), 0)
ip_cnt = 0'pickling NS name -> IP address mapping '
......@@ -36,7 +36,7 @@ Preparation
to speed up further processing. Do not skip this step, missing canonicalization
might cause problems down the road::
$ ldns-read-zone -E SOA -E NS -E A -E AAAA input_zone > zone.nodnssec
$ ldns-read-zone -zc -E SOA -E NS -E A -E AAAA input_zone > zone.nodnssec
Running scan
......@@ -4,6 +4,7 @@ Transform DNS zone file into pickled Python objects.
import argparse
import collections
import ipaddress
import logging
import pickle
......@@ -15,43 +16,76 @@ import
import dataapi
from ednsevalzone import AnIPAddress
def domain2nsset(zoneobj: -> Dict[, Set[]]:
class NameCache(collections.defaultdict):
TODO: optimize, currently it requires whole zone in single object'''
return {domain: set( for ns in node.find_rdataset(dns.rdataclass.IN, dns.rdatatype.NS))
for domain, node in zoneobj.items()
if node.get_rdataset(dns.rdataclass.IN, dns.rdatatype.NS)}
def uniq_nslist(nssets: Iterable[Set[]]) -> Set[]:
'''TODO: optimize, it could be done together with domain2nsset transformation'''
uniq_ns = set() # type: Set[]
for nsset in nssets:
return uniq_ns
def glue_ns2ipset(nslist: Set[], zoneobj: \
-> Dict[, Set[AnIPAddress]]:
NS names without glue addresses will not show up in output
Dict: str -> is slow and this is used to cache its results
ns2ipset = {} # type: Dict[, Set[AnIPAddress]]
for nsname in nslist:
if nsname in ns2ipset: # optimization, do not redo it
ipset = set() # type: Set[AnIPAddress]
node = zoneobj[nsname]
except KeyError: # glue name not in zone
for rdtype in (dns.rdatatype.A, dns.rdatatype.AAAA):
rrset = node.get_rdataset(dns.rdataclass.IN, rdtype)
if rrset:
ipset = ipset.union(set(ipaddress.ip_address(ip.address)
for ip in rrset))
if ipset: # do not store glueless names
ns2ipset[nsname] = ipset
return ns2ipset
def __missing__(self, name_str):
name =
self[name_str] = name
return name
class ZoneExtractor():
def __init__(self, owner_str: str):
# zone
self.origin =
self.nsnames = set() # type: Set[]
self.nsname2ipset = {} # type: Dict[, Set[AnIPAddress]]
self.domain2nsset = {} # type: Dict[, Set[]]
# data deduplication: temporary dict for singletons
self.nssets = {} # type: Dict[Set[], Set[]]
self.names = NameCache() # type: Dict[str,]
# reader
self.buf_owner_str = owner_str
self.buf_owner =
self.buf_rdtype_str = 'SOA'
self.buf_rdtexts = [] # type: List[str]
def finish_rrset(self):
if self.buf_rdtype_str == 'NS':
nsset = frozenset(self.names[nsname] for nsname in self.buf_rdtexts)
# use singleton frozenset to save memory, i.e. keep only a single copy of frozensets with the same hash
nsset = self.nssets.setdefault(nsset, nsset)
self.domain2nsset[self.buf_owner] = nsset
elif self.buf_rdtype_str == 'ipaddr':
ipset = frozenset(ipaddress.ip_address(ipaddr) for ipaddr in self.buf_rdtexts)
self.nsname2ipset[self.buf_owner] = ipset
elif self.buf_rdtype_str == 'SOA':
pass # ignore
raise NotImplementedError('rdtype %s not supported' % self.buf_rdtype_str)
self.buf_rdtexts = []
def consume_line(self, line_owner_str, line_rdtype, line_rdtext):
if line_rdtype == 'A' or line_rdtype == 'AAAA':
line_rdtype = 'ipaddr' # do not treat A/AAAA differently
#print(self.buf_owner_str, self.buf_rdtype_str, self.buf_rdtexts)
#print(line_owner_str, line_rdtype, line_rdtext)
if self.buf_owner_str != line_owner_str:
self.buf_owner_str = line_owner_str
# cache parsed owner name
self.buf_owner = self.names[line_owner_str]
elif self.buf_rdtype_str != line_rdtype:
# do not call finish twice!
self.buf_rdtype_str = line_rdtype
# now buffer compatible data types
def _read_soa(zone_file: TextIO) -> str:
line = zone_file.readline()
owner, ttl, rdclass, rdtype, rdtext = line.split('\t', 5)
assert(rdclass == 'IN')
assert(rdtype == 'SOA')
return owner
def convert(zone_file: TextIO, zone_origin: -> Tuple[ \
Dict[, Set[]], \
......@@ -59,17 +93,24 @@ def convert(zone_file: TextIO, zone_origin: -> Tuple[ \
Dict[, Set[AnIPAddress]]]:
convert text zone into set of pickle files with preprocessed metadata
only zone file canonicalized using this command is acceptable:
ldns-read-zone -zc -E SOA -E NS -E A -E AAAA
''''loading zone file')
zone =, origin=zone_origin, relativize=False)'loading and processing zone file')
#input_size = os.fstat(zone_file.fileno()).st_size'determining list of unique NS names')
nslist = uniq_nslist(domain2nsset(zone).values())
# read SOA
origin_str = _read_soa(zone_file)
zoneext = ZoneExtractor(origin_str)
assert(zone_origin == zoneext.origin)'computing NS name -> IP address mapping from zone glue data')
nsname2ipset = glue_ns2ipset(nslist, zone)
for line in zone_file:
owner_str, ttl, rdclass, rdtype, rdata = line.split('\t', 5)
zoneext.consume_line(owner_str, rdtype, rdata.strip())
return domain2nsset(zone), nslist, nsname2ipset
return zoneext.domain2nsset, zoneext.nsnames, zoneext.nsname2ipset
def save(domain_nsset, nslist, nsname2ipset):
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment