X-Git-Url: https://git.ozlabs.org/?p=ccan;a=blobdiff_plain;f=ccan%2Ftdb2%2Fdoc%2Fdesign.lyx;h=bb499482e5c077385fadfc1b82ed7c14577bedb5;hp=51378c3372ecbe7e17756d1749fe7a4373db68a7;hb=a42bba8ec446284256a7c9146ba3525404de474c;hpb=39f01834db9b6a21d076e67d1e3143ab99aaf43e diff --git a/ccan/tdb2/doc/design.lyx b/ccan/tdb2/doc/design.lyx index 51378c33..bb499482 100644 --- a/ccan/tdb2/doc/design.lyx +++ b/ccan/tdb2/doc/design.lyx @@ -1,4 +1,4 @@ -#LyX 1.6.5 created this file. For more info see http://www.lyx.org/ +#LyX 1.6.7 created this file. For more info see http://www.lyx.org/ \lyxformat 345 \begin_document \begin_header @@ -35,7 +35,7 @@ \paperpagestyle default \tracking_changes true \output_changes true -\author "" +\author "Rusty Russell,,," \author "" \end_header @@ -50,7 +50,7 @@ Rusty Russell, IBM Corporation \end_layout \begin_layout Date -26-July-2010 +1-December-2010 \end_layout \begin_layout Abstract @@ -470,6 +470,13 @@ The tdb_open() call was expanded to tdb_open_ex(), which added an optional \begin_layout Subsubsection Proposed Solution +\begin_inset CommandInset label +LatexCommand label +name "attributes" + +\end_inset + + \end_layout \begin_layout Standard @@ -573,6 +580,14 @@ This allows future attributes to be added, even if this expands the size of the union. \end_layout +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Complete. +\end_layout + \begin_layout Subsection tdb_traverse Makes Impossible Guarantees \end_layout @@ -614,6 +629,16 @@ Abandon the guarantee. You can prevent changes by using a transaction or the locking API. \end_layout +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Complete. + Delete-during-traverse will still delete every record, too (assuming no + other changes). +\end_layout + \begin_layout Subsection Nesting of Transactions Is Fraught \end_layout @@ -668,6 +693,14 @@ least-surprise -obscure case. \end_layout +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Incomplete; nesting flag is still defined as per tdb1. +\end_layout + \begin_layout Subsection Incorrect Hash Function is Not Detected \end_layout @@ -689,6 +722,14 @@ The header should contain an example hash result (eg. hash function produces the same answer, or fail the tdb_open call. \end_layout +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Complete. +\end_layout + \begin_layout Subsection tdb_set_max_dead/TDB_VOLATILE Expose Implementation \end_layout @@ -733,6 +774,16 @@ With the scalability problems of the freelist solved, this API can be removed. tuning, but initially will become a no-op. \end_layout +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Incomplete. + TDB_VOLATILE still defined, but implementation should fail on unknown flags + to be future-proof. +\end_layout + \begin_layout Subsection \begin_inset CommandInset label LatexCommand label @@ -785,6 +836,14 @@ I do not see benefit in an additional tdb_open flag to indicate whether an API. \end_layout +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Incomplete. +\end_layout + \begin_layout Subsection TDB API Is Not POSIX Thread-safe \end_layout @@ -829,6 +888,22 @@ Internal locking is required to make sure that fcntl locks do not overlap \begin_layout Standard The aim is that building tdb with -DTDB_PTHREAD will result in a pthread-safe version of the library, and otherwise no overhead will exist. + Alternatively, a hooking mechanism similar to that proposed for +\begin_inset CommandInset ref +LatexCommand ref +reference "Proposed-Solution-locking-hook" + +\end_inset + + could be used to enable pthread locking at runtime. +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Incomplete. \end_layout \begin_layout Subsection @@ -946,6 +1021,14 @@ This is flexible enough to handle any potential locking scenario, even when It also keeps the complexity out of the API, and in ctdbd where it is needed. \end_layout +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Incomplete. +\end_layout + \begin_layout Subsection tdb_chainlock Functions Expose Implementation \end_layout @@ -1027,6 +1110,14 @@ It may be possible to make this race-free in some implementations by having \end_layout +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Incomplete. +\end_layout + \begin_layout Subsection The API Uses Gratuitous Typedefs, Capitals \end_layout @@ -1103,6 +1194,14 @@ It should simply take an extra argument, since we are prepared to break the API/ABI. \end_layout +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Complete. +\end_layout + \begin_layout Subsection Various Callback Functions Are Not Typesafe \end_layout @@ -1142,6 +1241,14 @@ With careful use of macros, we can create callback functions which give See CCAN's typesafe_cb module at http://ccan.ozlabs.org/info/typesafe_cb.html \end_layout +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Incomplete. +\end_layout + \begin_layout Subsection TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, tdb_reopen_all Problematic \end_layout @@ -1179,6 +1286,157 @@ reference "TDB_CLEAR_IF_FIRST-Imposes-Performance" . \end_layout +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Incomplete, TDB_CLEAR_IF_FIRST still defined, but not implemented. +\end_layout + +\begin_layout Subsection +Extending The Header Is Difficult +\end_layout + +\begin_layout Standard +We have reserved (zeroed) words in the TDB header, which can be used for + future features. + If the future features are compulsory, the version number must be updated + to prevent old code from accessing the database. + But if the future feature is optional, we have no way of telling if older + code is accessing the database or not. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +The header should contain a +\begin_inset Quotes eld +\end_inset + +format variant +\begin_inset Quotes erd +\end_inset + + value (64-bit). + This is divided into two 32-bit parts: +\end_layout + +\begin_layout Enumerate +The lower part reflects the format variant understood by code accessing + the database. +\end_layout + +\begin_layout Enumerate +The upper part reflects the format variant you must understand to write + to the database (otherwise you can only open for reading). +\end_layout + +\begin_layout Standard +The latter field can only be written at creation time, the former should + be written under the OPEN_LOCK when opening the database for writing, if + the variant of the code is lower than the current lowest variant. +\end_layout + +\begin_layout Standard +This should allow backwards-compatible features to be added, and detection + if older code (which doesn't understand the feature) writes to the database. +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Incomplete. +\end_layout + +\begin_layout Subsection +Record Headers Are Not Expandible +\end_layout + +\begin_layout Standard +If we later want to add (say) checksums on keys and data, it would require + another format change, which we'd like to avoid. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +We often have extra padding at the tail of a record. + If we ensure that the first byte (if any) of this padding is zero, we will + have a way for future changes to detect code which doesn't understand a + new format: the new code would write (say) a 1 at the tail, and thus if + there is no tail or the first byte is 0, we would know the extension is + not present on that record. +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Incomplete. +\end_layout + +\begin_layout Subsection +TDB Does Not Use Talloc +\end_layout + +\begin_layout Standard +Many users of TDB (particularly Samba) use the talloc allocator, and thus + have to wrap TDB in a talloc context to use it conveniently. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +The allocation within TDB is not complicated enough to justify the use of + talloc, and I am reluctant to force another (excellent) library on TDB + users. + Nonetheless a compromise is possible. + An attribute (see +\begin_inset CommandInset ref +LatexCommand ref +reference "attributes" + +\end_inset + +) can be added later to tdb_open() to provide an alternate allocation mechanism, + specifically for talloc but usable by any other allocator (which would + ignore the +\begin_inset Quotes eld +\end_inset + +context +\begin_inset Quotes erd +\end_inset + + argument). +\end_layout + +\begin_layout Standard +This would form a talloc heirarchy as expected, but the caller would still + have to attach a destructor to the tdb context returned from tdb_open to + close it. + All TDB_DATA fields would be children of the tdb_context, and the caller + would still have to manage them (using talloc_free() or talloc_steal()). +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Deferred. +\end_layout + \begin_layout Section Performance And Scalability Issues \end_layout @@ -1234,6 +1492,14 @@ Remove the flag. point. \end_layout +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Incomplete; TDB_CLEAR_IF_FIRST still defined, but does nothing. +\end_layout + \begin_layout Subsection TDB Files Have a 4G Limit \end_layout @@ -1281,6 +1547,14 @@ Old versions of tdb will fail to open the new TDB files (since 28 August be erased and initialized as a fresh tdb!) \end_layout +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Complete. +\end_layout + \begin_layout Subsection TDB Records Have a 4G Limit \end_layout @@ -1310,6 +1584,14 @@ reference "sub:Records-Incur-A" ). \end_layout +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Complete. +\end_layout + \begin_layout Subsection Hash Size Is Determined At TDB Creation Time \end_layout @@ -1324,6 +1606,12 @@ TDB contains a number of hash chains in the header; the number is specified \end_layout \begin_layout Subsubsection +\begin_inset CommandInset label +LatexCommand label +name "sub:Hash-Size-Solution" + +\end_inset + Proposed Solution \end_layout @@ -1342,42 +1630,39 @@ http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94 This was annoyin , it became clear that it is hard to beat a straight linear hash table which doubles in size when it reaches saturation. - There are three details which become important: -\end_layout - -\begin_layout Enumerate -On encountering a full bucket, we use the next bucket. + Unfortunately, altering the hash table introduces serious locking complications +: the entire hash table needs to be locked to enlarge the hash table, and + others might be holding locks. + Particularly insidious are insertions done under tdb_chainlock. \end_layout -\begin_layout Enumerate -Extra hash bits are stored with the offset, to reduce comparisons. -\end_layout - -\begin_layout Enumerate -A marker entry is used on deleting an entry. +\begin_layout Standard +Thus an expanding layered hash will be used: an array of hash groups, with + each hash group exploding into pointers to lower hash groups once it fills, + turning into a hash tree. + This has implications for locking: we must lock the entire group in case + we need to expand it, yet we don't know how deep the tree is at that point. \end_layout \begin_layout Standard -The doubling of the table must be done under a transaction; we will not - reduce it on deletion, so it will be an unusual case. - It will either be placed at the head (other entries will be moved out the - way so we can expand). - We could have a pointer in the header to the current hashtable location, - but that pointer would have to be read frequently to check for hashtable - moves. +Note that bits from the hash table entries should be stolen to hold more + hash bits to reduce the penalty of collisions. + We can use the otherwise-unused lower 3 bits. + If we limit the size of the database to 64 exabytes, we can use the top + 8 bits of the hash entry as well. + These 11 bits would reduce false positives down to 1 in 2000 which is more + than we need: we can use one of the bits to indicate that the extra hash + bits are valid. + This means we can choose not to re-hash all entries when we expand a hash + group; simply use the next bits we need and mark them invalid. \end_layout -\begin_layout Standard -The locking for this is slightly more complex than the chained case; we - currently have one lock per bucket, and that means we would need to expand - the lock if we overflow to the next bucket. - The frequency of such collisions will effect our locking heuristics: we - can always lock more buckets than we need. +\begin_layout Subsubsection +Status \end_layout \begin_layout Standard -One possible optimization is to only re-check the hash size on an insert - or a lookup miss. +Complete. \end_layout \begin_layout Subsection @@ -1520,6 +1805,14 @@ The free list must be split to reduce contention. we can use far fewer, say 1/32 of the number of hash buckets. \end_layout +\begin_layout Standard +It seems tempting to try to reuse the hash implementation which we use for + records here, but we have two ways of searching for free entries: for allocatio +n we search by size (and possibly zone) which produces too many clashes + for our hash table to handle well, and for coalescing we search by address. + Thus an array of doubly-linked free lists seems preferable. +\end_layout + \begin_layout Standard There are various benefits in using per-size free lists (see \begin_inset CommandInset ref @@ -1531,24 +1824,28 @@ reference "sub:TDB-Becomes-Fragmented" ) but it's not clear this would reduce contention in the common case where all processes are allocating/freeing the same size. Thus we almost certainly need to divide in other ways: the most obvious - is to divide the file into zones, and using a free list (or set of free + is to divide the file into zones, and using a free list (or table of free lists) for each. This approximates address ordering. \end_layout \begin_layout Standard -Note that this means we need to split the free lists when we expand the - file; this is probably acceptable when we double the hash table size, since - that is such an expensive operation already. - In the case of increasing the file size, there is an optimization we can - use: if we use M in the formula above as the file size rounded up to the - next power of 2, we only need reshuffle free lists when the file size crosses - a power of 2 boundary, -\emph on -and -\emph default -reshuffling the free lists is trivial: we simply merge every consecutive - pair of free lists. +Unfortunately it is difficult to know what heuristics should be used to + determine zone sizes, and our transaction code relies on being able to + create a +\begin_inset Quotes eld +\end_inset + +recovery area +\begin_inset Quotes erd +\end_inset + + by simply appending to the file (difficult if it would need to create a + new zone header). + Thus we use a linked-list of free tables; currently we only ever create + one, but if there is more than one we choose one at random to use. + In future we may use heuristics to add new free tables on contention. + We only expand the file when all free tables are exhausted. \end_layout \begin_layout Standard @@ -1557,7 +1854,7 @@ The basic algorithm is as follows. \end_layout \begin_layout Enumerate -Identify the correct zone. +Identify the correct free list. \end_layout \begin_layout Enumerate @@ -1565,12 +1862,12 @@ Lock the corresponding list. \end_layout \begin_layout Enumerate -Re-check the zone (we didn't have a lock, sizes could have changed): relock +Re-check the list (we didn't have a lock, sizes could have changed): relock if necessary. \end_layout \begin_layout Enumerate -Place the freed entry in the list for that zone. +Place the freed entry in the list. \end_layout \begin_layout Standard @@ -1579,15 +1876,7 @@ Allocation is a little more complicated, as we perform delayed coalescing \end_layout \begin_layout Enumerate -Pick a zone either the zone we last freed into, or based on a -\begin_inset Quotes eld -\end_inset - -random -\begin_inset Quotes erd -\end_inset - - number. +Pick a free table; usually the previous one. \end_layout \begin_layout Enumerate @@ -1595,16 +1884,16 @@ Lock the corresponding list. \end_layout \begin_layout Enumerate -Re-check the zone: relock if necessary. +If the top entry is -large enough, remove it from the list and return it. \end_layout \begin_layout Enumerate -If the top entry is -large enough, remove it from the list and return it. +Otherwise, coalesce entries in the list.If there was no entry large enough, + unlock the list and try the next largest list \end_layout \begin_layout Enumerate -Otherwise, coalesce entries in the list.If there was no entry large enough, - unlock the list and try the next zone. +If no list has an entry which meets our needs, try the next free table. \end_layout \begin_layout Enumerate @@ -1636,9 +1925,8 @@ reference "sub:Records-Incur-A" \end_layout \begin_layout Standard -I anticipate that the number of entries in each free zone would be small, - but it might be worth using one free entry to hold pointers to the others - for cache efficiency. +Each free entry has the free table number in the header: less than 255. + It also contains a doubly-linked list for easy deletion. \end_layout \begin_layout Subsection @@ -1840,6 +2128,15 @@ miss it reduces 99.9% of false memcmp). As an aside, as the lower bits are already incorporated in the hash table resolution, the upper bits should be used here. + Note that it's not clear that these bits will be a win, given the extra + bits in the hash table itself (see +\begin_inset CommandInset ref +LatexCommand ref +reference "sub:Hash-Size-Solution" + +\end_inset + +). \end_layout \begin_layout Enumerate @@ -1876,11 +2173,11 @@ struct tdb_used_record { \end_layout \begin_layout LyX-Code - uint32_t magic : 16, + uint32_t used_magic : 16, \end_layout \begin_layout LyX-Code - prev_is_free: 1, + \end_layout \begin_layout LyX-Code @@ -1888,7 +2185,7 @@ struct tdb_used_record { \end_layout \begin_layout LyX-Code - top_hash: 10; + top_hash: 11; \end_layout \begin_layout LyX-Code @@ -1912,27 +2209,48 @@ struct tdb_free_record { \end_layout \begin_layout LyX-Code - uint32_t free_magic; + uint64_t free_magic: 8, \end_layout \begin_layout LyX-Code - uint64_t total_length; + prev : 56; \end_layout \begin_layout LyX-Code - ... + \end_layout \begin_layout LyX-Code - uint64_t tailer; + uint64_t free_table: 8, \end_layout \begin_layout LyX-Code -}; + total_length : 56 +\end_layout + +\begin_layout LyX-Code + uint64_t next;; \end_layout \begin_layout LyX-Code +}; +\end_layout + +\begin_layout Standard + +\change_deleted 0 1291206079 + +\change_unchanged +Note that by limiting valid offsets to 56 bits, we can pack everything we + need into 3 64-byte words, meaning our minimum record size is 8 bytes. +\end_layout + +\begin_layout Subsubsection +Status +\end_layout +\begin_layout Standard +Complete. \end_layout \begin_layout Subsection @@ -2025,6 +2343,14 @@ Checking for recovery means identifying the latest bundle with a valid checksum a transaction in progress; we need only check for recovery if this is set. \end_layout +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Deferred. +\end_layout + \begin_layout Subsection \begin_inset CommandInset label LatexCommand label @@ -2036,11 +2362,7 @@ TDB Does Not Have Snapshot Support \end_layout \begin_layout Subsubsection -Proposed Solution -\end_layout - -\begin_layout Standard -None. +Proposed SolutionNone. At some point you say \begin_inset Quotes eld \end_inset @@ -2049,7 +2371,14 @@ use a real database \begin_inset Quotes erd \end_inset -. + (but see +\begin_inset CommandInset ref +LatexCommand ref +reference "replay-attribute" + +\end_inset + +). \end_layout \begin_layout Standard @@ -2074,6 +2403,14 @@ We could then implement snapshots using a similar method, using multiple different hash tables/free tables. \end_layout +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Deferred. +\end_layout + \begin_layout Subsection Transactions Cannot Operate in Parallel \end_layout @@ -2091,13 +2428,29 @@ Proposed Solution \end_layout \begin_layout Standard -We could solve a small part of the problem by providing read-only transactions. +None (but see +\begin_inset CommandInset ref +LatexCommand ref +reference "replay-attribute" + +\end_inset + +). + We could solve a small part of the problem by providing read-only transactions. These would allow one write transaction to begin, but it could not commit until all r/o transactions are done. This would require a new RO_TRANSACTION_LOCK, which would be upgraded on commit. \end_layout +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Deferred. +\end_layout + \begin_layout Subsection Default Hash Function Is Suboptimal \end_layout @@ -2138,6 +2491,14 @@ The seed should be created at tdb-creation time from some random source, hash bombing. \end_layout +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Complete. +\end_layout + \begin_layout Subsection \begin_inset CommandInset label LatexCommand label @@ -2175,6 +2536,14 @@ reference "traverse-Proposed-Solution" . \end_layout +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Complete. +\end_layout + \begin_layout Subsection Fcntl Locking Adds Overhead \end_layout @@ -2278,5 +2647,48 @@ At some later point, a sync would allow recovery of the old data into the transactions until it encountered an invalid checksum. \end_layout +\begin_layout Subsection +Tracing Is Fragile, Replay Is External +\end_layout + +\begin_layout Standard +The current TDB has compile-time-enabled tracing code, but it often breaks + as it is not enabled by default. + In a similar way, the ctdb code has an external wrapper which does replay + tracing so it can coordinate cluster-wide transactions. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\begin_inset CommandInset label +LatexCommand label +name "replay-attribute" + +\end_inset + + +\end_layout + +\begin_layout Standard +Tridge points out that an attribute can be later added to tdb_open (see + +\begin_inset CommandInset ref +LatexCommand ref +reference "attributes" + +\end_inset + +) to provide replay/trace hooks, which could become the basis for this and + future parallel transactions and snapshot support. +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Deferred. +\end_layout + \end_body \end_document