Commit 8353e389 authored by Petr Špaček's avatar Petr Špaček

randomize order of queries in domain2ipset.py

Randomization should help with timeouts and response rate limiting.
parent 68a8fd68
......@@ -8,9 +8,11 @@ script sends queries to IP addresses from glue.
import collections
import enum
import itertools
import logging
import multiprocessing
import pickle
import random
from typing import Counter, Deque, Dict, Iterable, Set, Tuple
import dns.message
......@@ -154,12 +156,35 @@ def save(domain2nsset, netstats, domain2ipset):
logging.info('%s out of %s domains has at least one working NS (%0.2f %%)',
len(domain2ipset), len(domain2nsset), len(domain2ipset)/len(domain2nsset)*100)
def randomize_iter(iterable, window_len: int):
    """Yield the items of ``iterable`` in a locally randomized order.

    The input is consumed in consecutive chunks ("windows") of at most
    ``window_len`` items.  Each chunk is shuffled with ``random.shuffle``
    and then yielded, so an item is only ever reordered relative to the
    other items of its own window.  At most ``window_len`` items are
    buffered at any time.

    :param iterable: any iterable; both one-shot iterators/generators and
        re-iterable containers (lists, tuples, ...) are supported.
    :param window_len: maximum number of items buffered and shuffled
        together; must be greater than zero.
    :raises AssertionError: if ``window_len`` is not positive (raised when
        the generator is first advanced).
    """
    assert window_len > 0
    # Take a single iterator up front.  Calling islice() directly on a
    # re-iterable container would restart from its first item on every
    # pass and the loop below would never terminate.
    iterator = iter(iterable)
    while True:
        # buffer up to "window_len" items
        window = list(itertools.islice(iterator, window_len))
        if not window:
            # input exhausted
            return
        random.shuffle(window)
        yield from window
def update_mapping(domain2nsset, nsname2ipset, netstats, domain2ipset):
retry_queue = collections.deque() # type: Deque[Tuple[int, dns.name.Name, str]]
#logging.info('computing number of candidates to query')
#candidates = gen_candidates(domain2nsset, nsname2ipset, netstats, retry_queue, domain2ipset)
#logging.info('queue contains %s queries to be checked', count_candidates(candidates))
candidates = gen_candidates(domain2nsset, nsname2ipset, netstats, retry_queue, domain2ipset)
# It is useful to avoid querying the same IP address 100 times in a row
# because we could send all 100 queries in parallel, blocking all
# threads while waiting for a (potential) timeout.
#
# With a sufficiently large window we should detect a dead IP address
# before overshooting NetStats.timeouts_in_row limit and limit time wasted on timeouts.
#
# It could also help as a workaround for aggressive response rate limiting.
candidates = randomize_iter(
gen_candidates(domain2nsset, nsname2ipset, netstats, retry_queue, domain2ipset),
100000)
with multiprocessing.Pool(processes = 30) as pool:
for attempt, domain, ip, state in pool.imap_unordered(check_availability, candidates):
process_reply(attempt, domain, ip, state, netstats, retry_queue, domain2ipset)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment