X-Git-Url: https://git.ozlabs.org/?p=ccan;a=blobdiff_plain;f=ccan%2Ftdb2%2Fdoc%2Fdesign.lyx%2Cv;h=03d9a4bcb4cd01ef155c861f9daa53c8770b2139;hp=e44b3fc4b1f02d644d135cce182cf12df9b6ebe1;hb=a42bba8ec446284256a7c9146ba3525404de474c;hpb=39f01834db9b6a21d076e67d1e3143ab99aaf43e diff --git a/ccan/tdb2/doc/design.lyx,v b/ccan/tdb2/doc/design.lyx,v index e44b3fc4..03d9a4bc 100644 --- a/ccan/tdb2/doc/design.lyx,v +++ b/ccan/tdb2/doc/design.lyx,v @@ -1,10 +1,45 @@ -head 1.6; +head 1.13; access; symbols; locks; strict; comment @# @; +1.13 +date 2010.12.01.12.22.08; author rusty; state Exp; +branches; +next 1.12; + +1.12 +date 2010.12.01.12.20.49; author rusty; state Exp; +branches; +next 1.11; + +1.11 +date 2010.12.01.11.55.20; author rusty; state Exp; +branches; +next 1.10; + +1.10 +date 2010.09.14.00.33.57; author rusty; state Exp; +branches; +next 1.9; + +1.9 +date 2010.09.09.07.25.12; author rusty; state Exp; +branches; +next 1.8; + +1.8 +date 2010.09.02.02.29.05; author rusty; state Exp; +branches; +next 1.7; + +1.7 +date 2010.09.01.10.58.12; author rusty; state Exp; +branches; +next 1.6; + 1.6 date 2010.08.02.00.21.43; author rusty; state Exp; branches; @@ -41,12 +76,12 @@ desc @ -1.6 +1.13 log -@Commit changes +@Merged changes. @ text -@#LyX 1.6.5 created this file. For more info see http://www.lyx.org/ +@#LyX 1.6.7 created this file. For more info see http://www.lyx.org/ \lyxformat 345 \begin_document \begin_header @@ -83,7 +118,7 @@ text \paperpagestyle default \tracking_changes true \output_changes true -\author "" +\author "Rusty Russell,,," \author "" \end_header @@ -98,7 +133,7 @@ Rusty Russell, IBM Corporation \end_layout \begin_layout Date -26-July-2010 +1-December-2010 \end_layout \begin_layout Abstract @@ -518,6 +553,13 @@ The tdb_open() call was expanded to tdb_open_ex(), which added an optional \begin_layout Subsubsection Proposed Solution +\begin_inset CommandInset label +LatexCommand label +name "attributes" + +\end_inset + + \end_layout \begin_layout Standard @@ -621,6 +663,14 @@ This allows future attributes to be added, even if this expands the size of the union. \end_layout +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Complete. +\end_layout + \begin_layout Subsection tdb_traverse Makes Impossible Guarantees \end_layout @@ -662,6 +712,16 @@ Abandon the guarantee. You can prevent changes by using a transaction or the locking API. \end_layout +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Complete. + Delete-during-traverse will still delete every record, too (assuming no + other changes). +\end_layout + \begin_layout Subsection Nesting of Transactions Is Fraught \end_layout @@ -716,6 +776,14 @@ least-surprise -obscure case. \end_layout +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Incomplete; nesting flag is still defined as per tdb1. +\end_layout + \begin_layout Subsection Incorrect Hash Function is Not Detected \end_layout @@ -737,6 +805,14 @@ The header should contain an example hash result (eg. hash function produces the same answer, or fail the tdb_open call. \end_layout +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Complete. +\end_layout + \begin_layout Subsection tdb_set_max_dead/TDB_VOLATILE Expose Implementation \end_layout @@ -781,6 +857,16 @@ With the scalability problems of the freelist solved, this API can be removed. tuning, but initially will become a no-op. \end_layout +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Incomplete. + TDB_VOLATILE still defined, but implementation should fail on unknown flags + to be future-proof. +\end_layout + \begin_layout Subsection \begin_inset CommandInset label LatexCommand label @@ -833,6 +919,14 @@ I do not see benefit in an additional tdb_open flag to indicate whether an API. \end_layout +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Incomplete. +\end_layout + \begin_layout Subsection TDB API Is Not POSIX Thread-safe \end_layout @@ -877,6 +971,22 @@ Internal locking is required to make sure that fcntl locks do not overlap \begin_layout Standard The aim is that building tdb with -DTDB_PTHREAD will result in a pthread-safe version of the library, and otherwise no overhead will exist. + Alternatively, a hooking mechanism similar to that proposed for +\begin_inset CommandInset ref +LatexCommand ref +reference "Proposed-Solution-locking-hook" + +\end_inset + + could be used to enable pthread locking at runtime. +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Incomplete. \end_layout \begin_layout Subsection @@ -994,6 +1104,14 @@ This is flexible enough to handle any potential locking scenario, even when It also keeps the complexity out of the API, and in ctdbd where it is needed. \end_layout +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Incomplete. +\end_layout + \begin_layout Subsection tdb_chainlock Functions Expose Implementation \end_layout @@ -1075,6 +1193,14 @@ It may be possible to make this race-free in some implementations by having \end_layout +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Incomplete. +\end_layout + \begin_layout Subsection The API Uses Gratuitous Typedefs, Capitals \end_layout @@ -1151,6 +1277,14 @@ It should simply take an extra argument, since we are prepared to break the API/ABI. \end_layout +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Complete. +\end_layout + \begin_layout Subsection Various Callback Functions Are Not Typesafe \end_layout @@ -1190,6 +1324,14 @@ With careful use of macros, we can create callback functions which give See CCAN's typesafe_cb module at http://ccan.ozlabs.org/info/typesafe_cb.html \end_layout +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Incomplete. +\end_layout + \begin_layout Subsection TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, tdb_reopen_all Problematic \end_layout @@ -1227,6 +1369,157 @@ reference "TDB_CLEAR_IF_FIRST-Imposes-Performance" . \end_layout +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Incomplete, TDB_CLEAR_IF_FIRST still defined, but not implemented. +\end_layout + +\begin_layout Subsection +Extending The Header Is Difficult +\end_layout + +\begin_layout Standard +We have reserved (zeroed) words in the TDB header, which can be used for + future features. + If the future features are compulsory, the version number must be updated + to prevent old code from accessing the database. + But if the future feature is optional, we have no way of telling if older + code is accessing the database or not. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +The header should contain a +\begin_inset Quotes eld +\end_inset + +format variant +\begin_inset Quotes erd +\end_inset + + value (64-bit). + This is divided into two 32-bit parts: +\end_layout + +\begin_layout Enumerate +The lower part reflects the format variant understood by code accessing + the database. +\end_layout + +\begin_layout Enumerate +The upper part reflects the format variant you must understand to write + to the database (otherwise you can only open for reading). +\end_layout + +\begin_layout Standard +The latter field can only be written at creation time, the former should + be written under the OPEN_LOCK when opening the database for writing, if + the variant of the code is lower than the current lowest variant. +\end_layout + +\begin_layout Standard +This should allow backwards-compatible features to be added, and detection + if older code (which doesn't understand the feature) writes to the database. +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Incomplete. +\end_layout + +\begin_layout Subsection +Record Headers Are Not Expandible +\end_layout + +\begin_layout Standard +If we later want to add (say) checksums on keys and data, it would require + another format change, which we'd like to avoid. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +We often have extra padding at the tail of a record. + If we ensure that the first byte (if any) of this padding is zero, we will + have a way for future changes to detect code which doesn't understand a + new format: the new code would write (say) a 1 at the tail, and thus if + there is no tail or the first byte is 0, we would know the extension is + not present on that record. +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Incomplete. +\end_layout + +\begin_layout Subsection +TDB Does Not Use Talloc +\end_layout + +\begin_layout Standard +Many users of TDB (particularly Samba) use the talloc allocator, and thus + have to wrap TDB in a talloc context to use it conveniently. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\end_layout + +\begin_layout Standard +The allocation within TDB is not complicated enough to justify the use of + talloc, and I am reluctant to force another (excellent) library on TDB + users. + Nonetheless a compromise is possible. + An attribute (see +\begin_inset CommandInset ref +LatexCommand ref +reference "attributes" + +\end_inset + +) can be added later to tdb_open() to provide an alternate allocation mechanism, + specifically for talloc but usable by any other allocator (which would + ignore the +\begin_inset Quotes eld +\end_inset + +context +\begin_inset Quotes erd +\end_inset + + argument). +\end_layout + +\begin_layout Standard +This would form a talloc heirarchy as expected, but the caller would still + have to attach a destructor to the tdb context returned from tdb_open to + close it. + All TDB_DATA fields would be children of the tdb_context, and the caller + would still have to manage them (using talloc_free() or talloc_steal()). +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Deferred. +\end_layout + \begin_layout Section Performance And Scalability Issues \end_layout @@ -1282,6 +1575,14 @@ Remove the flag. point. \end_layout +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Incomplete; TDB_CLEAR_IF_FIRST still defined, but does nothing. +\end_layout + \begin_layout Subsection TDB Files Have a 4G Limit \end_layout @@ -1329,6 +1630,14 @@ Old versions of tdb will fail to open the new TDB files (since 28 August be erased and initialized as a fresh tdb!) \end_layout +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Complete. +\end_layout + \begin_layout Subsection TDB Records Have a 4G Limit \end_layout @@ -1358,6 +1667,14 @@ reference "sub:Records-Incur-A" ). \end_layout +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Complete. +\end_layout + \begin_layout Subsection Hash Size Is Determined At TDB Creation Time \end_layout @@ -1372,6 +1689,12 @@ TDB contains a number of hash chains in the header; the number is specified \end_layout \begin_layout Subsubsection +\begin_inset CommandInset label +LatexCommand label +name "sub:Hash-Size-Solution" + +\end_inset + Proposed Solution \end_layout @@ -1390,42 +1713,39 @@ http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94 This was annoyin , it became clear that it is hard to beat a straight linear hash table which doubles in size when it reaches saturation. - There are three details which become important: -\end_layout - -\begin_layout Enumerate -On encountering a full bucket, we use the next bucket. -\end_layout - -\begin_layout Enumerate -Extra hash bits are stored with the offset, to reduce comparisons. + Unfortunately, altering the hash table introduces serious locking complications +: the entire hash table needs to be locked to enlarge the hash table, and + others might be holding locks. + Particularly insidious are insertions done under tdb_chainlock. \end_layout -\begin_layout Enumerate -A marker entry is used on deleting an entry. +\begin_layout Standard +Thus an expanding layered hash will be used: an array of hash groups, with + each hash group exploding into pointers to lower hash groups once it fills, + turning into a hash tree. + This has implications for locking: we must lock the entire group in case + we need to expand it, yet we don't know how deep the tree is at that point. \end_layout \begin_layout Standard -The doubling of the table must be done under a transaction; we will not - reduce it on deletion, so it will be an unusual case. - It will either be placed at the head (other entries will be moved out the - way so we can expand). - We could have a pointer in the header to the current hashtable location, - but that pointer would have to be read frequently to check for hashtable - moves. +Note that bits from the hash table entries should be stolen to hold more + hash bits to reduce the penalty of collisions. + We can use the otherwise-unused lower 3 bits. + If we limit the size of the database to 64 exabytes, we can use the top + 8 bits of the hash entry as well. + These 11 bits would reduce false positives down to 1 in 2000 which is more + than we need: we can use one of the bits to indicate that the extra hash + bits are valid. + This means we can choose not to re-hash all entries when we expand a hash + group; simply use the next bits we need and mark them invalid. \end_layout -\begin_layout Standard -The locking for this is slightly more complex than the chained case; we - currently have one lock per bucket, and that means we would need to expand - the lock if we overflow to the next bucket. - The frequency of such collisions will effect our locking heuristics: we - can always lock more buckets than we need. +\begin_layout Subsubsection +Status \end_layout \begin_layout Standard -One possible optimization is to only re-check the hash size on an insert - or a lookup miss. +Complete. \end_layout \begin_layout Subsection @@ -1568,6 +1888,14 @@ The free list must be split to reduce contention. we can use far fewer, say 1/32 of the number of hash buckets. \end_layout +\begin_layout Standard +It seems tempting to try to reuse the hash implementation which we use for + records here, but we have two ways of searching for free entries: for allocatio +n we search by size (and possibly zone) which produces too many clashes + for our hash table to handle well, and for coalescing we search by address. + Thus an array of doubly-linked free lists seems preferable. +\end_layout + \begin_layout Standard There are various benefits in using per-size free lists (see \begin_inset CommandInset ref @@ -1579,24 +1907,28 @@ reference "sub:TDB-Becomes-Fragmented" ) but it's not clear this would reduce contention in the common case where all processes are allocating/freeing the same size. Thus we almost certainly need to divide in other ways: the most obvious - is to divide the file into zones, and using a free list (or set of free + is to divide the file into zones, and using a free list (or table of free lists) for each. This approximates address ordering. \end_layout \begin_layout Standard -Note that this means we need to split the free lists when we expand the - file; this is probably acceptable when we double the hash table size, since - that is such an expensive operation already. - In the case of increasing the file size, there is an optimization we can - use: if we use M in the formula above as the file size rounded up to the - next power of 2, we only need reshuffle free lists when the file size crosses - a power of 2 boundary, -\emph on -and -\emph default -reshuffling the free lists is trivial: we simply merge every consecutive - pair of free lists. +Unfortunately it is difficult to know what heuristics should be used to + determine zone sizes, and our transaction code relies on being able to + create a +\begin_inset Quotes eld +\end_inset + +recovery area +\begin_inset Quotes erd +\end_inset + + by simply appending to the file (difficult if it would need to create a + new zone header). + Thus we use a linked-list of free tables; currently we only ever create + one, but if there is more than one we choose one at random to use. + In future we may use heuristics to add new free tables on contention. + We only expand the file when all free tables are exhausted. \end_layout \begin_layout Standard @@ -1605,7 +1937,7 @@ The basic algorithm is as follows. \end_layout \begin_layout Enumerate -Identify the correct zone. +Identify the correct free list. \end_layout \begin_layout Enumerate @@ -1613,12 +1945,12 @@ Lock the corresponding list. \end_layout \begin_layout Enumerate -Re-check the zone (we didn't have a lock, sizes could have changed): relock +Re-check the list (we didn't have a lock, sizes could have changed): relock if necessary. \end_layout \begin_layout Enumerate -Place the freed entry in the list for that zone. +Place the freed entry in the list. \end_layout \begin_layout Standard @@ -1627,15 +1959,7 @@ Allocation is a little more complicated, as we perform delayed coalescing \end_layout \begin_layout Enumerate -Pick a zone either the zone we last freed into, or based on a -\begin_inset Quotes eld -\end_inset - -random -\begin_inset Quotes erd -\end_inset - - number. +Pick a free table; usually the previous one. \end_layout \begin_layout Enumerate @@ -1643,16 +1967,16 @@ Lock the corresponding list. \end_layout \begin_layout Enumerate -Re-check the zone: relock if necessary. +If the top entry is -large enough, remove it from the list and return it. \end_layout \begin_layout Enumerate -If the top entry is -large enough, remove it from the list and return it. +Otherwise, coalesce entries in the list.If there was no entry large enough, + unlock the list and try the next largest list \end_layout \begin_layout Enumerate -Otherwise, coalesce entries in the list.If there was no entry large enough, - unlock the list and try the next zone. +If no list has an entry which meets our needs, try the next free table. \end_layout \begin_layout Enumerate @@ -1684,9 +2008,8 @@ reference "sub:Records-Incur-A" \end_layout \begin_layout Standard -I anticipate that the number of entries in each free zone would be small, - but it might be worth using one free entry to hold pointers to the others - for cache efficiency. +Each free entry has the free table number in the header: less than 255. + It also contains a doubly-linked list for easy deletion. \end_layout \begin_layout Subsection @@ -1888,6 +2211,15 @@ miss it reduces 99.9% of false memcmp). As an aside, as the lower bits are already incorporated in the hash table resolution, the upper bits should be used here. + Note that it's not clear that these bits will be a win, given the extra + bits in the hash table itself (see +\begin_inset CommandInset ref +LatexCommand ref +reference "sub:Hash-Size-Solution" + +\end_inset + +). \end_layout \begin_layout Enumerate @@ -1924,11 +2256,11 @@ struct tdb_used_record { \end_layout \begin_layout LyX-Code - uint32_t magic : 16, + uint32_t used_magic : 16, \end_layout \begin_layout LyX-Code - prev_is_free: 1, + \end_layout \begin_layout LyX-Code @@ -1936,7 +2268,7 @@ struct tdb_used_record { \end_layout \begin_layout LyX-Code - top_hash: 10; + top_hash: 11; \end_layout \begin_layout LyX-Code @@ -1960,27 +2292,48 @@ struct tdb_free_record { \end_layout \begin_layout LyX-Code - uint32_t free_magic; + uint64_t free_magic: 8, \end_layout \begin_layout LyX-Code - uint64_t total_length; + prev : 56; \end_layout \begin_layout LyX-Code - ... + \end_layout \begin_layout LyX-Code - uint64_t tailer; + uint64_t free_table: 8, \end_layout \begin_layout LyX-Code -}; + total_length : 56 +\end_layout + +\begin_layout LyX-Code + uint64_t next;; \end_layout \begin_layout LyX-Code +}; +\end_layout + +\begin_layout Standard + +\change_deleted 0 1291206079 + +\change_unchanged +Note that by limiting valid offsets to 56 bits, we can pack everything we + need into 3 64-byte words, meaning our minimum record size is 8 bytes. +\end_layout + +\begin_layout Subsubsection +Status +\end_layout +\begin_layout Standard +Complete. \end_layout \begin_layout Subsection @@ -2073,6 +2426,14 @@ Checking for recovery means identifying the latest bundle with a valid checksum a transaction in progress; we need only check for recovery if this is set. \end_layout +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Deferred. +\end_layout + \begin_layout Subsection \begin_inset CommandInset label LatexCommand label @@ -2084,11 +2445,7 @@ TDB Does Not Have Snapshot Support \end_layout \begin_layout Subsubsection -Proposed Solution -\end_layout - -\begin_layout Standard -None. +Proposed SolutionNone. At some point you say \begin_inset Quotes eld \end_inset @@ -2097,7 +2454,14 @@ use a real database \begin_inset Quotes erd \end_inset -. + (but see +\begin_inset CommandInset ref +LatexCommand ref +reference "replay-attribute" + +\end_inset + +). \end_layout \begin_layout Standard @@ -2122,6 +2486,14 @@ We could then implement snapshots using a similar method, using multiple different hash tables/free tables. \end_layout +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Deferred. +\end_layout + \begin_layout Subsection Transactions Cannot Operate in Parallel \end_layout @@ -2139,13 +2511,29 @@ Proposed Solution \end_layout \begin_layout Standard -We could solve a small part of the problem by providing read-only transactions. +None (but see +\begin_inset CommandInset ref +LatexCommand ref +reference "replay-attribute" + +\end_inset + +). + We could solve a small part of the problem by providing read-only transactions. These would allow one write transaction to begin, but it could not commit until all r/o transactions are done. This would require a new RO_TRANSACTION_LOCK, which would be upgraded on commit. \end_layout +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Deferred. +\end_layout + \begin_layout Subsection Default Hash Function Is Suboptimal \end_layout @@ -2186,6 +2574,14 @@ The seed should be created at tdb-creation time from some random source, hash bombing. \end_layout +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Complete. +\end_layout + \begin_layout Subsection \begin_inset CommandInset label LatexCommand label @@ -2223,6 +2619,14 @@ reference "traverse-Proposed-Solution" . \end_layout +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Complete. +\end_layout + \begin_layout Subsection Fcntl Locking Adds Overhead \end_layout @@ -2326,11 +2730,1139 @@ At some later point, a sync would allow recovery of the old data into the transactions until it encountered an invalid checksum. \end_layout +\begin_layout Subsection +Tracing Is Fragile, Replay Is External +\end_layout + +\begin_layout Standard +The current TDB has compile-time-enabled tracing code, but it often breaks + as it is not enabled by default. + In a similar way, the ctdb code has an external wrapper which does replay + tracing so it can coordinate cluster-wide transactions. +\end_layout + +\begin_layout Subsubsection +Proposed Solution +\begin_inset CommandInset label +LatexCommand label +name "replay-attribute" + +\end_inset + + +\end_layout + +\begin_layout Standard +Tridge points out that an attribute can be later added to tdb_open (see + +\begin_inset CommandInset ref +LatexCommand ref +reference "attributes" + +\end_inset + +) to provide replay/trace hooks, which could become the basis for this and + future parallel transactions and snapshot support. +\end_layout + +\begin_layout Subsubsection +Status +\end_layout + +\begin_layout Standard +Deferred. +\end_layout + \end_body \end_document @ +1.12 +log +@Add status, some fixes, linked freelists. +@ +text +@d53 1 +a53 7 + +\change_deleted 0 1291204535 +14-September +\change_inserted 0 1291204533 +1-December +\change_unchanged +-2010 +a580 2 +\change_inserted 0 1291204563 + +a583 2 + +\change_inserted 0 1291204572 +a587 2 + +\change_inserted 0 1291204573 +a588 2 +\change_unchanged + +a629 2 +\change_inserted 0 1291204588 + +a632 2 + +\change_inserted 0 1291204588 +a636 2 + +\change_inserted 0 1291204631 +a639 2 +\change_unchanged + +a693 2 +\change_inserted 0 1291204639 + +a696 2 + +\change_inserted 0 1291204640 +a700 2 + +\change_inserted 0 1291204665 +a701 2 +\change_unchanged + +a722 2 +\change_inserted 0 1291204671 + +a725 2 + +\change_inserted 0 1291204671 +a729 2 + +\change_inserted 0 1291204673 +a730 2 +\change_unchanged + +a774 2 +\change_inserted 0 1291204731 + +a777 2 + +\change_inserted 0 1291204732 +a781 2 + +\change_inserted 0 1291204779 +a784 2 +\change_unchanged + +a836 2 +\change_inserted 0 1291204830 + +a839 2 + +\change_inserted 0 1291204831 +a843 2 + +\change_inserted 0 1291204834 +a844 2 +\change_unchanged + +a898 2 +\change_inserted 0 1291204847 + +a901 2 + +\change_inserted 0 1291204847 +a905 2 + +\change_inserted 0 1291204852 +a906 2 +\change_unchanged + +a1021 2 +\change_inserted 0 1291204881 + +a1024 2 + +\change_inserted 0 1291204881 +a1028 2 + +\change_inserted 0 1291204885 +a1029 2 +\change_unchanged + +a1110 2 +\change_inserted 0 1291204898 + +a1113 2 + +\change_inserted 0 1291204898 +a1117 2 + +\change_inserted 0 1291204901 +a1118 2 +\change_unchanged + +a1194 2 +\change_inserted 0 1291204908 + +a1197 2 + +\change_inserted 0 1291204908 +a1201 2 + +\change_inserted 0 1291204908 +a1202 2 +\change_unchanged + +a1241 2 +\change_inserted 0 1291204917 + +a1244 2 + +\change_inserted 0 1291204917 +a1248 2 + +\change_inserted 0 1291204920 +a1249 2 +\change_unchanged + +a1286 2 +\change_inserted 0 1291204927 + +a1289 2 + +\change_inserted 0 1291204928 +a1293 2 + +\change_inserted 0 1291204942 +a1294 2 +\change_unchanged + +a1345 2 +\change_inserted 0 1291205003 + +a1348 2 + +\change_inserted 0 1291205004 +a1352 2 + +\change_inserted 0 1291205007 +a1375 2 +\change_inserted 0 1291205019 + +a1378 2 + +\change_inserted 0 1291205019 +a1382 2 + +\change_inserted 0 1291205023 +a1383 2 +\change_unchanged + +a1429 2 +\change_inserted 0 1291205029 + +a1432 2 + +\change_inserted 0 1291205029 +a1436 2 + +\change_inserted 0 1291206020 +a1437 2 +\change_unchanged + +a1492 2 +\change_inserted 0 1291205043 + +a1495 2 + +\change_inserted 0 1291205043 +a1499 2 + +\change_inserted 0 1291205057 +a1500 2 +\change_unchanged + +a1547 2 +\change_inserted 0 1291205062 + +a1550 2 + +\change_inserted 0 1291205062 +a1554 2 + +\change_inserted 0 1291205062 +a1555 2 +\change_unchanged + +a1584 2 +\change_inserted 0 1291205072 + +a1587 2 + +\change_inserted 0 1291205073 +a1591 2 + +\change_inserted 0 1291205073 +a1592 2 +\change_unchanged + +a1632 4 + +\change_deleted 0 1291204504 + +\change_unchanged +a1657 2 +\change_inserted 0 1291205079 + +a1660 2 + +\change_inserted 0 1291205080 +a1664 2 + +\change_inserted 0 1291205080 +a1665 2 +\change_unchanged + +a1791 2 +\change_inserted 0 1291205090 + +d1827 2 +a1828 7 + is to divide the file into zones, and using a free list (or +\change_inserted 0 1291205498 +table +\change_deleted 0 1291205497 +set +\change_unchanged + of free lists) for each. +a1829 2 +\change_inserted 0 1291205203 + +a1832 2 + +\change_inserted 0 1291205358 +a1848 21 +\change_unchanged + +\end_layout + +\begin_layout Standard + +\change_deleted 0 1291205198 +Note that this means we need to split the free lists when we expand the + file; this is probably acceptable when we double the hash table size, since + that is such an expensive operation already. + In the case of increasing the file size, there is an optimization we can + use: if we use M in the formula above as the file size rounded up to the + next power of 2, we only need reshuffle free lists when the file size crosses + a power of 2 boundary, +\emph on +and +\emph default +reshuffling the free lists is trivial: we simply merge every consecutive + pair of free lists. +\change_unchanged + +d1857 1 +a1857 7 +Identify the correct +\change_inserted 0 1291205366 +free list +\change_deleted 0 1291205364 +zone +\change_unchanged +. +d1865 2 +a1866 7 +Re-check the +\change_inserted 0 1291205372 +list +\change_deleted 0 1291205371 +zone +\change_unchanged + (we didn't have a lock, sizes could have changed): relock if necessary. +d1870 1 +a1870 5 +Place the freed entry in the list +\change_deleted 0 1291205382 + for that zone +\change_unchanged +. +d1879 1 +a1879 15 +Pick a +\change_deleted 0 1291205403 +zone either the zone we last freed into, or based on a +\begin_inset Quotes eld +\end_inset + +random +\begin_inset Quotes erd +\end_inset + + number. +\change_inserted 0 1291205411 +free table; usually the previous one. +\change_unchanged + +a1883 10 +\change_deleted 0 1291205432 + +\end_layout + +\begin_layout Enumerate + +\change_deleted 0 1291205428 +Re-check the zone: relock if necessary. +\change_unchanged + +d1892 1 +a1892 7 + unlock the list and try the next +\change_inserted 0 1291205455 +largest list +\change_deleted 0 1291205452 +zone. +\change_inserted 0 1291205457 + +a1895 2 + +\change_inserted 0 1291205476 +a1896 2 +\change_unchanged + +a1924 2 +\change_inserted 0 1291205542 + +a1927 2 + +\change_inserted 0 1291205591 +a1929 70 +\change_unchanged + +\end_layout + +\begin_layout Standard + +\change_deleted 0 1291205539 +I anticipate that the number of entries in each free zone would be small, + but it might be worth using one free entry to hold pointers to the others + for cache efficiency. +\change_unchanged + +\end_layout + +\begin_layout Standard + +\change_deleted 0 1291205534 +\begin_inset CommandInset label +LatexCommand label +name "freelist-in-zone" + +\end_inset + +If we want to avoid locking complexity (enlarging the free lists when we + enlarge the file) we could place the array of free lists at the beginning + of each zone. + This means existing array lists never move, but means that a record cannot + be larger than a zone. + That in turn implies that zones should be variable sized (say, power of + 2), which makes the question +\begin_inset Quotes eld +\end_inset + +what zone is this record in? +\begin_inset Quotes erd +\end_inset + + much harder (and +\begin_inset Quotes eld +\end_inset + +pick a random zone +\begin_inset Quotes erd +\end_inset + +, but that's less common). + It could be done with as few as 4 bits from the record header. +\begin_inset Foot +status collapsed + +\begin_layout Plain Layout +Using +\begin_inset Formula $2^{16+N*3}$ +\end_inset + +means 0 gives a minimal 65536-byte zone, 15 gives the maximal +\begin_inset Formula $2^{61}$ +\end_inset + + byte zone. + Zones range in factor of 8 steps. + Given the zone size for the zone the current record is in, we can determine + the start of the zone. +\end_layout + +\end_inset + + +\change_inserted 0 1291205139 + +d2176 1 +a2176 5 + uint32_t +\change_inserted 0 1291205758 +used_ +\change_unchanged +magic : 16, +a2180 4 +\change_deleted 0 1291205693 + prev_is_free: 1, +\change_unchanged + +d2188 1 +a2188 7 + top_hash: 1 +\change_inserted 0 1291205704 +1 +\change_deleted 0 1291205704 +0 +\change_unchanged +; +d2212 1 +a2212 9 + uint +\change_inserted 0 1291205725 +64 +\change_deleted 0 1291205723 +32 +\change_unchanged +_t +\change_inserted 0 1291205753 +free_magic: 8, +a2215 2 + +\change_inserted 0 1291205746 +a2220 24 +\change_deleted 0 1291205749 +free_magic; +\change_unchanged + +\end_layout + +\begin_layout LyX-Code + uint64_t +\change_inserted 0 1291205786 +free_table: 8, +\end_layout + +\begin_layout LyX-Code + +\change_inserted 0 1291205788 + +\change_unchanged +total_length +\change_inserted 0 1291205792 + : 56 +\change_deleted 0 1291205790 +; +\change_unchanged + +d2224 1 +a2224 7 + uint64_t +\change_deleted 0 1291205801 +prev, +\change_unchanged +next; +\change_deleted 0 1291205811 + +d2228 1 +a2228 3 + +\change_deleted 0 1291205811 + ... +d2232 1 +a2232 5 + +\change_deleted 0 1291205808 + uint64_t tailer +\change_unchanged +; +d2241 5 +a2245 16 +\change_deleted 0 1291205827 +We might want to take some bits from the used record's top_hash (and the + free record which has 32 bits of padding to spare anyway) if we use variable + sized zones. + See +\begin_inset CommandInset ref +LatexCommand ref +reference "freelist-in-zone" + +\end_inset + +. + +\change_inserted 0 1291205885 + Note that by limiting valid offsets to 56 bits, we can pack everything + we need into 3 64-byte words, meaning our minimum record size is 8 bytes. +a2248 2 + +\change_inserted 0 1291205886 +a2252 2 + +\change_inserted 0 1291205886 +a2253 2 +\change_unchanged + +a2343 2 +\change_inserted 0 1291205894 + +a2346 2 + +\change_inserted 0 1291205894 +a2350 2 + +\change_inserted 0 1291205902 +a2351 2 +\change_unchanged + +a2373 4 + +\change_deleted 0 1291204504 + +\change_unchanged +a2403 2 +\change_inserted 0 1291205910 + +a2406 2 + +\change_inserted 0 1291205910 +a2410 2 + +\change_inserted 0 1291205914 +a2411 2 +\change_unchanged + +a2443 2 +\change_inserted 0 1291205919 + +a2446 2 + +\change_inserted 0 1291205919 +a2450 2 + +\change_inserted 0 1291205922 +a2451 2 +\change_unchanged + +a2491 2 +\change_inserted 0 1291205929 + +a2494 2 + +\change_inserted 0 1291205929 +a2498 2 + +\change_inserted 0 1291205929 +a2499 2 +\change_unchanged + +a2536 2 +\change_inserted 0 1291205932 + +a2539 2 + +\change_inserted 0 1291205933 +a2543 2 + +\change_inserted 0 1291205933 +a2544 2 +\change_unchanged + +a2682 2 +\change_inserted 0 1291205944 + +a2685 2 + +\change_inserted 0 1291205945 +a2689 2 + +\change_inserted 0 1291205948 +a2690 2 +\change_unchanged + +@ + + +1.11 +log +@Merge changes +@ +text +@d53 7 +a59 1 +14-September-2010 +d587 16 +d644 18 +d716 16 +d753 16 +d813 18 +d883 16 +d953 16 +d1084 16 +d1181 16 +d1273 16 +d1328 16 +d1381 16 +d1447 19 +a1465 2 + if older code (which doesn't understand the feature) writes to the database.Reco +rd Headers Are Not Expandible +d1484 16 +d1546 16 +d1617 16 +d1680 16 +d1725 16 +d1810 16 +d1951 8 +a1958 3 +Proposed SolutionThe first step is to remove all the current heuristics, + as they obviously interact, then examine them once the lock contention + is addressed. +d1989 7 +a1995 2 + is to divide the file into zones, and using a free list (or set of free + lists) for each. +d1997 2 +d2002 25 +d2039 2 +d2049 7 +a2055 1 +Identify the correct zone. +d2063 7 +a2069 2 +Re-check the zone (we didn't have a lock, sizes could have changed): relock + if necessary. +d2073 5 +a2077 1 +Place the freed entry in the list for that zone. +d2086 3 +a2088 1 +Pick a zone either the zone we last freed into, or based on a +d2097 4 +d2105 2 +d2110 2 +d2113 2 +d2123 15 +a2137 1 + unlock the list and try the next zone. +d2166 11 +d2180 2 +d2185 2 +d2190 2 +d2223 1 +a2223 1 +status open +d2243 2 +d2491 5 +a2495 1 + uint32_t magic : 16, +d2499 2 +d2502 2 +d2511 7 +a2517 1 + top_hash: 10; +d2541 29 +a2569 1 + uint32_t free_magic; +d2573 11 +a2583 1 + uint64_t total_length; +d2587 7 +a2593 1 + uint64_t prev, next; +d2597 2 +d2603 5 +a2607 1 + uint64_t tailer; +d2615 2 +d2628 18 +d2736 16 +d2808 16 +d2856 16 +d2912 16 +d2965 16 +d3119 16 +@ + + +1.10 +log +@Tracing attribute, talloc support. +@ +text +@d1 1 +a1 1 +#LyX 1.6.5 created this file. For more info see http://www.lyx.org/ +d53 1 +a53 7 + +\change_deleted 0 1283307542 +26-July +\change_inserted 0 1284423485 +14-September +\change_unchanged +-2010 +a472 2 +\change_inserted 0 1284422789 + +a479 2 +\change_unchanged + +a838 2 + +\change_inserted 0 1284016998 +a846 2 +\change_unchanged + +a1194 2 +\change_inserted 0 1284015637 + +a1197 2 + +\change_inserted 0 1284015716 +a1201 2 + +\change_inserted 0 1284015906 +a1210 2 + +\change_inserted 0 1284015637 +a1214 2 + +\change_inserted 0 1284016114 +a1227 2 + +\change_inserted 0 1284016149 +a1232 2 + +\change_inserted 0 1284016639 +a1237 2 + +\change_inserted 0 1284016821 +a1243 2 + +\change_inserted 0 1284016803 +d1245 2 +a1246 9 + if older code (which doesn't understand the feature) writes to the database. +\change_deleted 0 1284016101 + +\end_layout + +\begin_layout Subsection + +\change_inserted 0 1284015634 +Record Headers Are Not Expandible +a1249 2 + +\change_inserted 0 1284015634 +a1254 2 + +\change_inserted 0 1284015634 +a1258 2 + +\change_inserted 0 1284422552 +a1267 2 + +\change_inserted 0 1284422568 +a1271 2 + +\change_inserted 0 1284422646 +a1276 2 + +\change_inserted 0 1284422656 +a1280 2 + +\change_inserted 0 1284423065 +a1305 2 + +\change_inserted 0 1284423042 +a1310 2 +\change_unchanged + +a1457 2 + +\change_inserted 0 1283336713 +a1463 2 + +\change_unchanged +d1482 2 +d1485 1 +a1485 51 +\change_deleted 0 1283307675 +There are three details which become important: +\end_layout + +\begin_layout Enumerate + +\change_deleted 0 1283307675 +On encountering a full bucket, we use the next bucket. +\end_layout + +\begin_layout Enumerate + +\change_deleted 0 1283307675 +Extra hash bits are stored with the offset, to reduce comparisons. +\end_layout + +\begin_layout Enumerate + +\change_deleted 0 1283307675 +A marker entry is used on deleting an entry. +\end_layout + +\begin_layout Standard + +\change_deleted 0 1283307675 +The doubling of the table must be done under a transaction; we will not + reduce it on deletion, so it will be an unusual case. + It will either be placed at the head (other entries will be moved out the + way so we can expand). + We could have a pointer in the header to the current hashtable location, + but that pointer would have to be read frequently to check for hashtable + moves. +\end_layout + +\begin_layout Standard + +\change_deleted 0 1283307675 +The locking for this is slightly more complex than the chained case; we + currently have one lock per bucket, and that means we would need to expand + the lock if we overflow to the next bucket. + The frequency of such collisions will effect our locking heuristics: we + can always lock more buckets than we need. +\end_layout + +\begin_layout Standard + +\change_deleted 0 1283307675 +One possible optimization is to only re-check the hash size on an insert + or a lookup miss. + +\change_inserted 0 1283307770 +a1492 2 + +\change_inserted 0 1283336187 +a1500 2 + +\change_inserted 0 1283336586 +a1510 2 +\change_unchanged + +d1636 3 +a1638 8 +Proposed Solution +\change_deleted 0 1283336858 + +\end_layout + +\begin_layout Standard +The first step is to remove all the current heuristics, as they obviously + interact, then examine them once the lock contention is addressed. +a1647 2 +\change_inserted 0 1283336910 + +a1650 2 + +\change_inserted 0 1283337052 +a1655 2 +\change_unchanged + +a1776 2 +\change_inserted 0 1283309850 + +a1779 2 + +\change_inserted 0 1283337216 +a1813 2 + +\change_inserted 0 1284424151 +a1825 2 +\change_unchanged + +a1830 2 +\change_unchanged + +a2031 2 + +\change_inserted 0 1283336739 +a2040 2 +\change_unchanged + +a2117 2 +\change_inserted 0 1283337133 + +a2120 2 + +\change_inserted 0 1283337139 +a2121 2 +\change_unchanged + +a2136 2 + +\change_inserted 0 1283337235 +a2147 2 +\change_unchanged + +d2251 1 +a2251 7 +Proposed Solution +\change_deleted 0 1284423472 + +\end_layout + +\begin_layout Standard +None. +d2261 1 +a2261 1 +\change_inserted 0 1284423891 +d2263 1 +a2263 4 +\change_deleted 0 1284423891 +. + +\change_inserted 0 1284423901 +a2271 2 +\change_unchanged + +a2293 2 +\change_inserted 0 1284423495 + +a2312 2 + +\change_inserted 0 1284424201 +d2321 1 +a2321 3 + +\change_unchanged +We could solve a small part of the problem by providing read-only transactions. +a2505 2 +\change_inserted 0 1284423555 + +a2508 2 + +\change_inserted 0 1284423617 +a2512 2 + +\change_inserted 0 1284423719 +a2519 2 + +\change_inserted 0 1284423864 +a2530 2 + +\change_inserted 0 1284423850 +a2540 2 +\change_unchanged + +@ + + +1.9 +log +@Extension mechanism. +@ +text +@d56 2 +a57 2 +\change_inserted 0 1284016854 +9-September +d479 11 +d1303 1 +a1303 1 +\change_inserted 0 1284016847 +d1310 56 +d1945 1 +a1945 1 +\change_inserted 0 1283310945 +d1956 2 +d2402 2 +d2416 4 +d2421 12 +d2455 2 +d2476 12 +d2673 47 +@ + + +1.8 +log +@Remove bogus footnote +@ +text +@d56 2 +a57 2 +\change_inserted 0 1283307544 +1-September +d838 12 +d1198 103 +@ + + +1.7 +log +@Moving hash table does not work. +@ +text +@a1436 12 +\begin_inset Foot +status collapsed + +\begin_layout Plain Layout + +\change_inserted 0 1283336450 +If we make the hash offsets zone-relative, then this only restricts the + zone size, not the overall database size. +\end_layout + +\end_inset + +@ + + +1.6 +log +@Commit changes +@ +text +@d38 1 +a38 1 +\author "" +d53 7 +a59 1 +26-July-2010 +d1333 10 +d1361 3 +a1363 1 + There are three details which become important: +d1367 2 +d1373 2 +d1379 2 +d1385 2 +d1397 2 +d1407 2 +d1411 45 +d1582 2 +d1598 14 +d1733 62 +d1996 13 +d2086 10 +d2110 15 +a2124 1 +\begin_layout LyX-Code +@ + + 1.5 log @Soft transaction commit