Commit 1bc09711 authored by Tony Finch's avatar Tony Finch Committed by Daniel Salzman

trie: support for copy-on-write transactions

A COW transaction allows a trie to be used for reading concurrently
while a modified version of the trie is being prepared. The change
can be committed by swapping the new trie root in place of the old one.

Internally, this feature uses one bit reference counts to identify which
parts of the trie are shared between the old and new versions, which
parts are new-only (so can be mutated) and which parts are old-only
(and will be free()d after commit).
parent e7b7e187
......@@ -441,6 +441,7 @@ tests/contrib/test_dynarray.c
tests/contrib/test_heap.c
tests/contrib/test_net.c
tests/contrib/test_net_shortwrite.c
tests/contrib/test_qp-cow.c
tests/contrib/test_qp-trie.c
tests/contrib/test_siphash.c
tests/contrib/test_sockaddr.c
......
This diff is collapsed.
/* Copyright (C) 2017 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
/* Copyright (C) 2018 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
Copyright (C) 2018 Tony Finch <dot@dotat.at>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
......@@ -39,6 +40,20 @@ typedef struct trie trie_t;
/*! \brief Opaque type for holding a QP-trie iterator. */
typedef struct trie_it trie_it_t;
/*! \brief Callback for performing actions on a trie leaf
*
* Used during copy-on-write transactions
*
* \param val The value of the element to be altered
* \param key The key of the element to be altered
* \param len The length of key
* \param d Additional user data
*/
typedef void trie_cb(trie_val_t val, const char *key, size_t len, void *d);
/*! \brief Opaque type for holding the copy-on-write state for a QP-trie. */
typedef struct trie_cow trie_cow_t;
/*! \brief Create a trie instance. */
trie_t* trie_create(knot_mm_t *mm);
......@@ -110,3 +125,93 @@ const char* trie_it_key(trie_it_t *it, size_t *len);
/*! \brief Return pointer to the value of the current element (writable). */
trie_val_t* trie_it_val(trie_it_t *it);
/*! \brief Start a COW transaction
*
* A copy-on-write transaction starts by obtaining a write lock (in
* your application code) followed by a call to trie_cow(). This
* creates a shared clone of the trie and saves both old and new roots
* in the COW context.
*
* During the COW transaction, you call trie_get_cow() or
* trie_del_cow() as necessary. These calls ensure that the relevant
* parts of the (new) trie are copied so that they can be modified
* freely.
*
* Your trie_val_t objects must be able to distinguish their
* reachability, either shared, or old-only, or new-only. Before a COW
* transaction the reachability of your objects is indeterminate.
* During a transaction, any trie_val_t objects that might be affected
* (because they are adjacent to a trie_get_cow() or trie_del_cow())
* are first marked as shared using the callback you pass to
* trie_cow().
*
* When the transaction is complete, to commit, call trie_cow_new() to
* get the new root, swap the old and new trie roots (e.g. with
* rcu_xchg_pointer()), wait for readers to finish with the old trie
* (e.g. using synchronize_rcu()), then call trie_cow_commit(). For a
* rollback, you can just call trie_cow_rollback() without waiting
* since that doesn't conflict with readers. After trie_cow_commit()
* or trie_cow_rollback() has finished, you can release your write
* lock.
*
* Concurrent reading of the old trie is allowed during a transaction
* provided that it is known when all readers have finished with the
* old version, e.g. using rcu_read_lock() and rcu_read_unlock().
* There must be only one write transaction at a time.
*
* \param old the old trie
* \param mark_shared callback to mark a leaf as shared
* \param d extra data for the callback
* \return a pointer to a COW context,
* or NULL if there was a failure
*/
trie_cow_t* trie_cow(trie_t *old, trie_cb *mark_shared, void *d);
/*! \brief get the new trie from a COW context */
trie_t* trie_cow_new(trie_cow_t *cow);
/*! \brief variant of trie_get_ins() for use during COW transactions
*
* As necessary, this copies the path from the root of the trie to the
* leaf, so that it is no longer shared. Any leaves adjacent to this
* path are marked as shared using the mark_shared callback passed to
* trie_cow().
*
* It is your responsibility to COW your trie_val_t objects. If you copy an
* object you must change the original's reachability from shared to old-only.
* New objects (including copies) must have new-only reachability.
*/
trie_val_t* trie_get_cow(trie_cow_t *cow, const char *key, uint32_t len);
/*!
* \brief variant of trie_del() for use during COW transactions
*
* The mark_shared callback is invoked as necessary, in the same way
* as trie_get_cow().
*
* Returns KNOT_EOK if the key was removed or KNOT_ENOENT if not found.
* If val != NULL and deletion succeeded, *val is set to the deleted
* value pointer.
*/
int trie_del_cow(trie_cow_t *cow, const char *key, uint32_t len, trie_val_t *val);
/*! \brief clean up the old trie after committing a COW transaction
*
* Your callback is invoked for any trie_val_t objects that might need
* cleaning up; you must free any objects you have marked as old-only
* and retain objects with shared reachability.
*
* The cow object is free()d, and the new trie root is returned.
*/
trie_t* trie_cow_commit(trie_cow_t *cow, trie_cb *cb, void *d);
/*! \brief clean up the new trie after rolling back a COW transaction
*
* Your callback is invoked for any trie_val_t objects that might need
* cleaning up; you must free any objects you have marked as new-only
* and retain objects with shared reachability.
*
* The cow object is free()d, and the old trie root is returned.
*/
trie_t* trie_cow_rollback(trie_cow_t *cow, trie_cb *cb, void *d);
......@@ -7,6 +7,7 @@
/contrib/test_heap
/contrib/test_net
/contrib/test_net_shortwrite
/contrib/test_qp-cow
/contrib/test_qp-trie
/contrib/test_siphash
/contrib/test_sockaddr
......
......@@ -50,6 +50,7 @@ check_PROGRAMS = \
contrib/test_net \
contrib/test_net_shortwrite \
contrib/test_qp-trie \
contrib/test_qp-cow \
contrib/test_siphash \
contrib/test_sockaddr \
contrib/test_string \
......
/* Copyright (C) 2018 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
Copyright (C) 2018 Tony Finch <dot@dotat.at>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <err.h>
#include <unistd.h>
#include "contrib/qp-trie/trie.h"
#include "contrib/macros.h"
#include "contrib/string.h"
#include "libknot/errcode.h"
#include "tap/basic.h"
/* Constants: default test parameters, each overridable on the command line. */
/* Default for -k: generated key lengths are always below this value. */
#define MAX_KEYLEN 64
/* Default for -l: number of leaves in the pool the test draws from. */
#define MAX_LEAVES 12345
/* Default for -m: each transaction applies fewer than this many mutations. */
#define MAX_MUTATIONS 123
/* Default for -t: number of transactions (test rounds) to run. */
#define MAX_TRANSACTIONS 1234
/*
 * State the test expects each leaf to be in.
 *
 * main() moves leaves between these states as it mutates the trie, and the
 * callbacks assert that the trie code visits leaves only in the states that
 * are legal at that point.
 */
enum cowstate {
cow_absent, // not in trie
cow_unmarked, // in trie; main() resets surviving leaves to this after each transaction
cow_shared, // set by mark_cb() on a leaf that was cow_unmarked
cow_old, // deleted from new trie
cow_new, // added to new trie
deadbeef, // set by commit_rollback() on non-shared leaves; main() turns it back into cow_absent
};
/* One test leaf: a generated key together with the state the test tracks for it. */
struct cowleaf {
char *key; // NUL-terminated key allocated in grow_leaves(), freed in dead_leaves()
size_t len; // length of key, not counting the terminating NUL
int cowstate; // current state, one of the enum cowstate values
};
/*!
 * \brief Return a pseudo-random number in the range [0, max).
 *
 * Built on rand(); the plain modulo is slightly biased, which is acceptable
 * for generating test data.
 *
 * \param max exclusive upper bound; an empty range (max == 0) has no valid
 *            result, so 0 is returned instead of dividing by zero
 * \return a value below max, or 0 when max is 0
 */
static inline size_t
prng(size_t max) {
	if (max == 0) {
		return 0;
	}
	/* good enough these days */
	return (size_t)rand() % max;
}
/*!
 * \brief Build the pool of leaves used by the test.
 *
 * Every leaf gets a random alphanumeric key whose length is below maxlen.
 * A temporary trie is used to reject duplicates: a key is regenerated
 * whenever its slot in that trie already holds a value.
 *
 * \param maxlen exclusive upper bound on the key length
 * \param leaves number of leaves to generate
 * \return array of leaves, all in state cow_absent; release it with
 *         dead_leaves()
 */
static struct cowleaf *
grow_leaves(size_t maxlen, size_t leaves)
{
	static const char alphabet[] =
		"0123456789"
		"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
		"abcdefghijklmnopqrstuvwxyz";

	struct cowleaf *pool = bcalloc(leaves, sizeof(*pool));
	trie_t *seen = trie_create(NULL);
	if (seen == NULL) {
		sysbail("trie_create");
	}

	for (size_t n = 0; n < leaves; n++) {
		char *key = NULL;
		size_t keylen = 0;
		trie_val_t *slot;

		/* Keep drawing keys until one lands in an empty slot. */
		for (;;) {
			free(key); /* drops the rejected key; no-op on the first pass */
			keylen = prng(maxlen);
			key = bmalloc(keylen + 1);
			size_t pos = 0;
			while (pos < keylen) {
				key[pos] = alphabet[prng(sizeof(alphabet) - 1)];
				pos++;
			}
			key[keylen] = '\0';

			slot = trie_get_ins(seen, key, (uint32_t)keylen);
			if (slot == NULL) {
				bail("trie_get_ins");
			}
			if (*slot == NULL) {
				break;
			}
		}

		*slot = &pool[n];
		pool[n].key = key;
		pool[n].len = keylen;
		pool[n].cowstate = cow_absent;
	}

	/* The trie only served to detect duplicates. */
	trie_free(seen);
	return pool;
}
/*!
 * \brief Release a leaf pool created by grow_leaves().
 *
 * \param leaf   array of leaves
 * \param leaves number of entries in the array
 */
static void
dead_leaves(struct cowleaf *leaf, size_t leaves)
{
	size_t idx = 0;
	while (idx < leaves) {
		free(leaf[idx].key);
		idx++;
	}
	free(leaf);
}
/*!
 * \brief mark_shared callback handed to trie_cow().
 *
 * Asserts that the leaf has not been marked yet and then records it as
 * shared.
 *
 * \param val the leaf (a struct cowleaf pointer)
 * \param key unused
 * \param len unused
 * \param d   unused
 */
static void
mark_cb(trie_val_t val, const char *key, size_t len, void *d)
{
	(void)key;
	(void)len;
	(void)d;

	struct cowleaf *leaf = val;
	assert(leaf->cowstate == cow_unmarked &&
	       "leaf should go from unmarked to shared exactly once");
	leaf->cowstate = cow_shared;
}
/*!
 * \brief Cleanup callback passed to trie_cow_commit() and trie_cow_rollback().
 *
 * Shared leaves are left untouched. Any other leaf must be old-only when
 * committing or new-only when rolling back; it is then flagged as deadbeef.
 *
 * \param val the leaf (a struct cowleaf pointer)
 * \param key unused
 * \param len unused
 * \param d   pointer to an int: non-zero for commit, zero for rollback
 */
static void
commit_rollback(trie_val_t val, const char *key, size_t len, void *d)
{
	(void)key;
	(void)len;

	struct cowleaf *leaf = val;
	const int committing = *(int *)d;

	/* A shared leaf is legal in both directions and is kept as it is. */
	if (leaf->cowstate == cow_shared) {
		return;
	}

	if (committing) {
		assert(leaf->cowstate == cow_old &&
		       "committing deletes from old trie");
	} else {
		assert(leaf->cowstate == cow_new &&
		       "roll back deletes from new trie");
	}
	leaf->cowstate = deadbeef;
}
/*!
 * \brief Delete a leaf from the new trie of a COW transaction.
 *
 * The deletion is performed outside of assert() so that it still happens
 * when assertions are compiled out with NDEBUG.
 *
 * \param x    the COW transaction
 * \param leaf the leaf to delete; the deletion is asserted to succeed and
 *             to hand back this very leaf as the removed value
 */
static void
del_cow(trie_cow_t *x, struct cowleaf *leaf)
{
	trie_val_t val = NULL;
	int ret = trie_del_cow(x, leaf->key, (uint32_t)leaf->len, &val);
	assert(ret == KNOT_EOK);
	assert(val == leaf);
	/* Only read by the assertions above. */
	(void)ret;
	(void)val;
}
/*!
 * \brief Print the command-line help to stderr and exit with status 1.
 *
 * The defaults shown are the MAX_* constants.
 */
static void
usage(void) {
	fprintf(stderr,
		"usage: test_qp-cow [-k N] [-l N] [-m N] [-t N]\n"
		" -k N maximum key length (default %d)\n"
		" -l N number of leaves (default %d)\n"
		" -m N mutations per transaction (default %d)\n"
		" -t N number of transactions (default %d)\n",
		MAX_KEYLEN,
		MAX_LEAVES,
		MAX_MUTATIONS,
		MAX_TRANSACTIONS);
	exit(1);
}
/*!
 * \brief Parse a decimal command-line count.
 *
 * Prints the usage message and exits when the argument is not a decimal
 * integer, is below min, or does not fit in an int.
 *
 * \param arg option argument text
 * \param min smallest accepted value
 * \return the parsed value
 */
static int
parse_count(const char *arg, int min)
{
	char *end = NULL;
	long value = strtol(arg, &end, 10);
	if (end == arg || *end != '\0' || value < min || value != (int)value) {
		usage();
	}
	return (int)value;
}

/*!
 * \brief Randomised test of copy-on-write trie transactions.
 *
 * Generates a pool of leaves with unique keys and then runs a number of
 * transactions. Each transaction applies a random number of insertions and
 * deletions through the COW API, randomly commits or rolls back, and then
 * checks that every leaf is in a state consistent with that outcome. One
 * test result is reported per transaction.
 */
int
main(int argc, char *argv[])
{
	size_t keylen = MAX_KEYLEN;
	size_t leaves = MAX_LEAVES;
	int mutations = MAX_MUTATIONS;
	int transactions = MAX_TRANSACTIONS;

	int opt;
	while ((opt = getopt(argc, argv, "k:l:m:t:h")) != -1) {
		switch (opt) {
		/* k, l and m feed prng() as a modulus, so they must be positive. */
		case('k'):
			keylen = (size_t)parse_count(optarg, 1);
			continue;
		case('l'):
			leaves = (size_t)parse_count(optarg, 1);
			continue;
		case('m'):
			mutations = parse_count(optarg, 1);
			continue;
		case('t'):
			transactions = parse_count(optarg, 0);
			continue;
		default:
			usage();
		}
	}

	if (argc != optind)
		usage();

	plan(transactions);

	struct cowleaf *leaf = grow_leaves(keylen, leaves);
	trie_t *t = trie_create(NULL);
	if (!t) sysbail("trie_create");

	for (int round = 0; round < transactions; round++) {
		trie_cow_t *x = trie_cow(t, mark_cb, NULL);
		if (!x) sysbail("trie_cow");

		/* Apply a random number of mutations to randomly chosen leaves. */
		int hits = (int)prng((size_t)mutations);
		for (int hit = 0; hit < hits; hit++) {
			size_t i = prng(leaves);
			switch (leaf[i].cowstate) {
			case(cow_absent): {
				/* Not in the trie: insert it into the new version. */
				trie_val_t *val =
					trie_get_cow(x,
						     leaf[i].key,
						     (uint32_t)leaf[i].len);
				if (!val) sysbail("trie_get_cow");
				assert(*val == NULL && "new leaf");
				*val = &leaf[i];
				leaf[i].cowstate = cow_new;
			} break;
			case(cow_unmarked): {
				/* The deletion has to mark the leaf as shared first. */
				del_cow(x, &leaf[i]);
				assert(leaf[i].cowstate == cow_shared &&
				       "state changed unmarked -> shared");
				leaf[i].cowstate = cow_old;
			} break;
			case(cow_shared): {
				del_cow(x, &leaf[i]);
				assert(leaf[i].cowstate == cow_shared &&
				       "state remained shared");
				leaf[i].cowstate = cow_old;
			} break;
			case(cow_new): {
				/* Added earlier in this transaction: take it out again. */
				del_cow(x, &leaf[i]);
				assert(leaf[i].cowstate == cow_new &&
				       "state remained new");
				leaf[i].cowstate = cow_absent;
			} break;
			case(cow_old): {
				// don't want to mess with old tree
			} break;
			case(deadbeef): {
				assert(!"deadbeef should not be possible");
			} break;
			default:
				assert(!"bug - unhandled state");
			}
		}

		/* Finish the transaction either way with equal probability. */
		int commit = !prng(2);
		if (commit)
			t = trie_cow_commit(x, commit_rollback, &commit);
		else
			t = trie_cow_rollback(x, commit_rollback, &commit);

		/* Every leaf left in the surviving trie must be in a state that
		 * matches the outcome; reset it for the next round. */
		trie_it_t *it = trie_it_begin(t);
		if (!it) sysbail("trie_it_begin");
		while (!trie_it_finished(it)) {
			trie_val_t *val = trie_it_val(it);
			assert(val != NULL);
			struct cowleaf *l = *val;
			if (commit)
				assert((l->cowstate == cow_unmarked ||
					l->cowstate == cow_shared ||
					l->cowstate == cow_new) &&
				       "committing expected state");
			else
				assert((l->cowstate == cow_unmarked ||
					l->cowstate == cow_shared ||
					l->cowstate == cow_old) &&
				       "roll back expected state");
			l->cowstate = cow_unmarked;
			trie_it_next(it);
		}
		trie_it_free(it);

		/* Leaves outside the trie are either absent already or were
		 * flagged by the cleanup callback; the latter become absent. */
		for (size_t i = 0; i < leaves; i++) {
			assert((leaf[i].cowstate == cow_unmarked ||
				leaf[i].cowstate == cow_absent ||
				leaf[i].cowstate == deadbeef) &&
			       "cleanup leaves either unmarked or dead");
			if (leaf[i].cowstate == deadbeef)
				leaf[i].cowstate = cow_absent;
		}
		ok(1, "transaction done");
	}

	trie_free(t);
	dead_leaves(leaf, leaves);
	return 0;
}
/* Copyright (C) 2016 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
/* Copyright (C) 2018 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment