Commit 437f02e3 authored by Petr Špaček

Merge branch 'randomize_query_order' into 'master'

randomize order of queries in domain2ipset.py

See merge request !13
parents 68a8fd68 97efa28e
Pipeline #44421 passed with stages in 7 minutes and 28 seconds
@@ -8,9 +8,11 @@ script sends queries to IP addresses from glue.
import collections
import enum
import itertools
import logging
import multiprocessing
import pickle
import random
from typing import Counter, Deque, Dict, Iterable, Set, Tuple
import dns.message
@@ -154,12 +156,35 @@ def save(domain2nsset, netstats, domain2ipset):
    logging.info('%s out of %s domains has at least one working NS (%0.2f %%)',
                 len(domain2ipset), len(domain2nsset), len(domain2ipset)/len(domain2nsset)*100)


def randomize_iter(iterable, window_len: int):
    """Randomize order of iteration over an iterable using fixed window."""
    assert window_len > 0
    # buffer up to "window_len" items
    window = [True]
    while window:
        window = list(itertools.islice(iterable, window_len))
        random.shuffle(window)
        yield from window
def update_mapping(domain2nsset, nsname2ipset, netstats, domain2ipset):
    retry_queue = collections.deque()  # type: Deque[Tuple[int, dns.name.Name, str]]
    #logging.info('computing number of candidates to query')
    #candidates = gen_candidates(domain2nsset, nsname2ipset, netstats, retry_queue, domain2ipset)
    #logging.info('queue contains %s queries to be checked', count_candidates(candidates))
    candidates = gen_candidates(domain2nsset, nsname2ipset, netstats, retry_queue, domain2ipset)
    # It is useful to avoid querying the same IP address 100 times in a row
    # because we could send all 100 queries in parallel and block all
    # threads waiting for a (potential) timeout.
    #
    # With a sufficiently large window we should detect a dead IP address
    # before overshooting the NetStats.timeouts_in_row limit, limiting the time wasted on timeouts.
    #
    # It could also help as a workaround for aggressive response rate limiting.
    candidates = randomize_iter(
        gen_candidates(domain2nsset, nsname2ipset, netstats, retry_queue, domain2ipset),
        100000)
    with multiprocessing.Pool(processes=30) as pool:
        for attempt, domain, ip, state in pool.imap_unordered(check_availability, candidates):
            process_reply(attempt, domain, ip, state, netstats, retry_queue, domain2ipset)
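For context (this sketch is not part of the commit): the windowed shuffle only reorders items inside each window, so a window of 100 000 spreads queries to the same IP address apart while keeping memory bounded by the window length. Note that itertools.islice has to keep consuming one shared iterator; the committed code appears to rely on gen_candidates() returning a generator, so this standalone sketch calls iter() explicitly.

import itertools
import random

def randomize_iter(iterable, window_len: int):
    """Randomize iteration order over an iterable using a fixed-size window."""
    assert window_len > 0
    iterator = iter(iterable)  # islice must consume a single shared iterator
    window = [True]  # non-empty placeholder so the loop body runs at least once
    while window:
        # buffer up to "window_len" items, shuffle them, then hand them out
        window = list(itertools.islice(iterator, window_len))
        random.shuffle(window)
        yield from window

# Toy run with a window of 3: items are only reordered within their own window.
print(list(randomize_iter(range(9), 3)))
# e.g. [2, 0, 1, 5, 3, 4, 8, 6, 7]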
@@ -14,7 +14,7 @@ def repeat_genreport(cycles: int):
        logging.info('genreport round {} / {}, output file {}'.format(
            cycle, cycles, ednscomp_output.name))
        subprocess.run(check=True, stdin=ednscomp_input, stdout=ednscomp_output,
                       args=['genreport', '-m', '500', '-p'])
                       args=['genreport', '-m', '500', '-p', '-s'])
        ednscomp_input.seek(0)


def check_env():
@@ -36,4 +36,4 @@ if __name__ == '__main__':
        sys.exit(1)
    check_env()
    repeat_genreport(cycles)
\ No newline at end of file
    repeat_genreport(cycles)