tdb2: update design doc.

[ccan] / ccan / tdb2 / doc / design.lyx
diff --git a/ccan/tdb2/doc/design.lyx b/ccan/tdb2/doc/design.lyx

index 51378c3372ecbe7e17756d1749fe7a4373db68a7..ca17f8fece511ee9e31d1fb7b7280f11429273fc 100644 (file)
--- a/ccan/tdb2/doc/design.lyx
+++ b/ccan/tdb2/doc/design.lyx
@@ -35,7 +35,7 @@
  \paperpagestyle default
  \tracking_changes true
  \output_changes true
-\author "" 
+\author "Rusty Russell,,," 
  \author "" 
  \end_header
  
@@ -50,7 +50,13 @@ Rusty Russell, IBM Corporation
  \end_layout
  
  \begin_layout Date
-26-July-2010
+
+\change_deleted 0 1283307542
+26-July
+\change_inserted 0 1284423485
+14-September
+\change_unchanged
+-2010
  \end_layout
  
  \begin_layout Abstract
@@ -470,6 +476,17 @@ The tdb_open() call was expanded to tdb_open_ex(), which added an optional
  
  \begin_layout Subsubsection
  Proposed Solution
+\change_inserted 0 1284422789
+
+\begin_inset CommandInset label
+LatexCommand label
+name "attributes"
+
+\end_inset
+
+
+\change_unchanged
+
  \end_layout
  
  \begin_layout Standard
@@ -829,6 +846,18 @@ Internal locking is required to make sure that fcntl locks do not overlap
  \begin_layout Standard
  The aim is that building tdb with -DTDB_PTHREAD will result in a pthread-safe
   version of the library, and otherwise no overhead will exist.
+
+\change_inserted 0 1284016998
+ Alternatively, a hooking mechanism similar to that proposed for 
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "Proposed-Solution-locking-hook"
+
+\end_inset
+
+ could be used to enable pthread locking at runtime.
+\change_unchanged
+
  \end_layout
  
  \begin_layout Subsection
@@ -1177,6 +1206,165 @@ reference "TDB_CLEAR_IF_FIRST-Imposes-Performance"
  \end_inset
  
  .
+\change_inserted 0 1284015637
+
+\end_layout
+
+\begin_layout Subsection
+
+\change_inserted 0 1284015716
+Extending The Header Is Difficult
+\end_layout
+
+\begin_layout Standard
+
+\change_inserted 0 1284015906
+We have reserved (zeroed) words in the TDB header, which can be used for
+ future features.
+ If the future features are compulsory, the version number must be updated
+ to prevent old code from accessing the database.
+ But if the future feature is optional, we have no way of telling if older
+ code is accessing the database or not.
+\end_layout
+
+\begin_layout Subsubsection
+
+\change_inserted 0 1284015637
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+
+\change_inserted 0 1284016114
+The header should contain a 
+\begin_inset Quotes eld
+\end_inset
+
+format variant
+\begin_inset Quotes erd
+\end_inset
+
+ value (64-bit).
+ This is divided into two 32-bit parts:
+\end_layout
+
+\begin_layout Enumerate
+
+\change_inserted 0 1284016149
+The lower part reflects the format variant understood by code accessing
+ the database.
+\end_layout
+
+\begin_layout Enumerate
+
+\change_inserted 0 1284016639
+The upper part reflects the format variant you must understand to write
+ to the database (otherwise you can only open for reading).
+\end_layout
+
+\begin_layout Standard
+
+\change_inserted 0 1284016821
+The latter field can only be written at creation time, the former should
+ be written under the OPEN_LOCK when opening the database for writing, if
+ the variant of the code is lower than the current lowest variant.
+\end_layout
+
+\begin_layout Standard
+
+\change_inserted 0 1284016803
+This should allow backwards-compatible features to be added, and detection
+ if older code (which doesn't understand the feature) writes to the database.
+\change_deleted 0 1284016101
+
+\end_layout
+
+\begin_layout Subsection
+
+\change_inserted 0 1284015634
+Record Headers Are Not Expandible
+\end_layout
+
+\begin_layout Standard
+
+\change_inserted 0 1284015634
+If we later want to add (say) checksums on keys and data, it would require
+ another format change, which we'd like to avoid.
+\end_layout
+
+\begin_layout Subsubsection
+
+\change_inserted 0 1284015634
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+
+\change_inserted 0 1284422552
+We often have extra padding at the tail of a record.
+ If we ensure that the first byte (if any) of this padding is zero, we will
+ have a way for future changes to detect code which doesn't understand a
+ new format: the new code would write (say) a 1 at the tail, and thus if
+ there is no tail or the first byte is 0, we would know the extension is
+ not present on that record.
+\end_layout
+
+\begin_layout Subsection
+
+\change_inserted 0 1284422568
+TDB Does Not Use Talloc
+\end_layout
+
+\begin_layout Standard
+
+\change_inserted 0 1284422646
+Many users of TDB (particularly Samba) use the talloc allocator, and thus
+ have to wrap TDB in a talloc context to use it conveniently.
+\end_layout
+
+\begin_layout Subsubsection
+
+\change_inserted 0 1284422656
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+
+\change_inserted 0 1284423065
+The allocation within TDB is not complicated enough to justify the use of
+ talloc, and I am reluctant to force another (excellent) library on TDB
+ users.
+ Nonetheless a compromise is possible.
+ An attribute (see 
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "attributes"
+
+\end_inset
+
+) can be added later to tdb_open() to provide an alternate allocation mechanism,
+ specifically for talloc but usable by any other allocator (which would
+ ignore the 
+\begin_inset Quotes eld
+\end_inset
+
+context
+\begin_inset Quotes erd
+\end_inset
+
+ argument).
+\end_layout
+
+\begin_layout Standard
+
+\change_inserted 0 1284423042
+This would form a talloc heirarchy as expected, but the caller would still
+ have to attach a destructor to the tdb context returned from tdb_open to
+ close it.
+ All TDB_DATA fields would be children of the tdb_context, and the caller
+ would still have to manage them (using talloc_free() or talloc_steal()).
+\change_unchanged
+
  \end_layout
  
  \begin_layout Section
@@ -1324,6 +1512,16 @@ TDB contains a number of hash chains in the header; the number is specified
  \end_layout
  
  \begin_layout Subsubsection
+
+\change_inserted 0 1283336713
+\begin_inset CommandInset label
+LatexCommand label
+name "sub:Hash-Size-Solution"
+
+\end_inset
+
+
+\change_unchanged
  Proposed Solution
  \end_layout
  
@@ -1342,22 +1540,32 @@ http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94 This was annoyin
  
  , it became clear that it is hard to beat a straight linear hash table which
   doubles in size when it reaches saturation.
- There are three details which become important:
+ 
+\change_deleted 0 1283307675
+There are three details which become important:
  \end_layout
  
  \begin_layout Enumerate
+
+\change_deleted 0 1283307675
  On encountering a full bucket, we use the next bucket.
  \end_layout
  
  \begin_layout Enumerate
+
+\change_deleted 0 1283307675
  Extra hash bits are stored with the offset, to reduce comparisons.
  \end_layout
  
  \begin_layout Enumerate
+
+\change_deleted 0 1283307675
  A marker entry is used on deleting an entry.
  \end_layout
  
  \begin_layout Standard
+
+\change_deleted 0 1283307675
  The doubling of the table must be done under a transaction; we will not
   reduce it on deletion, so it will be an unusual case.
   It will either be placed at the head (other entries will be moved out the
@@ -1368,6 +1576,8 @@ The doubling of the table must be done under a transaction; we will not
  \end_layout
  
  \begin_layout Standard
+
+\change_deleted 0 1283307675
  The locking for this is slightly more complex than the chained case; we
   currently have one lock per bucket, and that means we would need to expand
   the lock if we overflow to the next bucket.
@@ -1376,8 +1586,43 @@ The locking for this is slightly more complex than the chained case; we
  \end_layout
  
  \begin_layout Standard
+
+\change_deleted 0 1283307675
  One possible optimization is to only re-check the hash size on an insert
   or a lookup miss.
+
+\change_inserted 0 1283307770
+ Unfortunately, altering the hash table introduces serious locking complications
+: the entire hash table needs to be locked to enlarge the hash table, and
+ others might be holding locks.
+ Particularly insidious are insertions done under tdb_chainlock.
+\end_layout
+
+\begin_layout Standard
+
+\change_inserted 0 1283336187
+Thus an expanding layered hash will be used: an array of hash groups, with
+ each hash group exploding into pointers to lower hash groups once it fills,
+ turning into a hash tree.
+ This has implications for locking: we must lock the entire group in case
+ we need to expand it, yet we don't know how deep the tree is at that point.
+\end_layout
+
+\begin_layout Standard
+
+\change_inserted 0 1283336586
+Note that bits from the hash table entries should be stolen to hold more
+ hash bits to reduce the penalty of collisions.
+ We can use the otherwise-unused lower 3 bits.
+ If we limit the size of the database to 64 exabytes, we can use the top
+ 8 bits of the hash entry as well.
+ These 11 bits would reduce false positives down to 1 in 2000 which is more
+ than we need: we can use one of the bits to indicate that the extra hash
+ bits are valid.
+ This means we can choose not to re-hash all entries when we expand a hash
+ group; simply use the next bits we need and mark them invalid.
+\change_unchanged
+
  \end_layout
  
  \begin_layout Subsection
@@ -1504,6 +1749,8 @@ The single list lock limits our allocation rate; due to the other issues
  
  \begin_layout Subsubsection
  Proposed Solution
+\change_deleted 0 1283336858
+
  \end_layout
  
  \begin_layout Standard
@@ -1518,6 +1765,20 @@ The free list must be split to reduce contention.
   This implies that the number of free lists is related to the size of the
   hash table, but as it is rare to walk a large number of free list entries
   we can use far fewer, say 1/32 of the number of hash buckets.
+\change_inserted 0 1283336910
+
+\end_layout
+
+\begin_layout Standard
+
+\change_inserted 0 1283337052
+It seems tempting to try to reuse the hash implementation which we use for
+ records here, but we have two ways of searching for free entries: for allocatio
+n we search by size (and possibly zone) which produces too many clashes
+ for our hash table to handle well, and for coalescing we search by address.
+ Thus an array of doubly-linked free lists seems preferable.
+\change_unchanged
+
  \end_layout
  
  \begin_layout Standard
@@ -1639,6 +1900,70 @@ reference "sub:Records-Incur-A"
  I anticipate that the number of entries in each free zone would be small,
   but it might be worth using one free entry to hold pointers to the others
   for cache efficiency.
+\change_inserted 0 1283309850
+
+\end_layout
+
+\begin_layout Standard
+
+\change_inserted 0 1283337216
+\begin_inset CommandInset label
+LatexCommand label
+name "freelist-in-zone"
+
+\end_inset
+
+If we want to avoid locking complexity (enlarging the free lists when we
+ enlarge the file) we could place the array of free lists at the beginning
+ of each zone.
+ This means existing array lists never move, but means that a record cannot
+ be larger than a zone.
+ That in turn implies that zones should be variable sized (say, power of
+ 2), which makes the question 
+\begin_inset Quotes eld
+\end_inset
+
+what zone is this record in?
+\begin_inset Quotes erd
+\end_inset
+
+ much harder (and 
+\begin_inset Quotes eld
+\end_inset
+
+pick a random zone
+\begin_inset Quotes erd
+\end_inset
+
+, but that's less common).
+ It could be done with as few as 4 bits from the record header.
+\begin_inset Foot
+status open
+
+\begin_layout Plain Layout
+
+\change_inserted 0 1284424151
+Using 
+\begin_inset Formula $2^{16+N*3}$
+\end_inset
+
+means 0 gives a minimal 65536-byte zone, 15 gives the maximal 
+\begin_inset Formula $2^{61}$
+\end_inset
+
+ byte zone.
+ Zones range in factor of 8 steps.
+ Given the zone size for the zone the current record is in, we can determine
+ the start of the zone.
+\change_unchanged
+
+\end_layout
+
+\end_inset
+
+
+\change_unchanged
+
  \end_layout
  
  \begin_layout Subsection
@@ -1840,6 +2165,19 @@ miss
   it reduces 99.9% of false memcmp).
   As an aside, as the lower bits are already incorporated in the hash table
   resolution, the upper bits should be used here.
+
+\change_inserted 0 1283336739
+ Note that it's not clear that these bits will be a win, given the extra
+ bits in the hash table itself (see 
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "sub:Hash-Size-Solution"
+
+\end_inset
+
+).
+\change_unchanged
+
  \end_layout
  
  \begin_layout Enumerate
@@ -1917,6 +2255,16 @@ struct tdb_free_record {
  
  \begin_layout LyX-Code
          uint64_t total_length;
+\change_inserted 0 1283337133
+
+\end_layout
+
+\begin_layout LyX-Code
+
+\change_inserted 0 1283337139
+        uint64_t prev, next;
+\change_unchanged
+
  \end_layout
  
  \begin_layout LyX-Code
@@ -1931,7 +2279,21 @@ struct tdb_free_record {
  };
  \end_layout
  
-\begin_layout LyX-Code
+\begin_layout Standard
+
+\change_inserted 0 1283337235
+We might want to take some bits from the used record's top_hash (and the
+ free record which has 32 bits of padding to spare anyway) if we use variable
+ sized zones.
+ See 
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "freelist-in-zone"
+
+\end_inset
+
+.
+\change_unchanged
  
  \end_layout
  
@@ -2037,6 +2399,8 @@ TDB Does Not Have Snapshot Support
  
  \begin_layout Subsubsection
  Proposed Solution
+\change_deleted 0 1284423472
+
  \end_layout
  
  \begin_layout Standard
@@ -2049,7 +2413,23 @@ use a real database
  \begin_inset Quotes erd
  \end_inset
  
+
+\change_inserted 0 1284423891
+ 
+\change_deleted 0 1284423891
  .
+
+\change_inserted 0 1284423901
+ (but see 
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "replay-attribute"
+
+\end_inset
+
+).
+\change_unchanged
+
  \end_layout
  
  \begin_layout Standard
@@ -2072,6 +2452,8 @@ This would not allow arbitrary changes to the database, such as tdb_repack
  \begin_layout Standard
  We could then implement snapshots using a similar method, using multiple
   different hash tables/free tables.
+\change_inserted 0 1284423495
+
  \end_layout
  
  \begin_layout Subsection
@@ -2091,6 +2473,18 @@ Proposed Solution
  \end_layout
  
  \begin_layout Standard
+
+\change_inserted 0 1284424201
+None (but see 
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "replay-attribute"
+
+\end_inset
+
+).
+ 
+\change_unchanged
  We could solve a small part of the problem by providing read-only transactions.
   These would allow one write transaction to begin, but it could not commit
   until all r/o transactions are done.
@@ -2276,6 +2670,53 @@ At some later point, a sync would allow recovery of the old data into the
   free lists (perhaps when the array of top-level pointers filled).
   On crash, tdb_open() would examine the array of top levels, and apply the
   transactions until it encountered an invalid checksum.
+\change_inserted 0 1284423555
+
+\end_layout
+
+\begin_layout Subsection
+
+\change_inserted 0 1284423617
+Tracing Is Fragile, Replay Is External
+\end_layout
+
+\begin_layout Standard
+
+\change_inserted 0 1284423719
+The current TDB has compile-time-enabled tracing code, but it often breaks
+ as it is not enabled by default.
+ In a similar way, the ctdb code has an external wrapper which does replay
+ tracing so it can coordinate cluster-wide transactions.
+\end_layout
+
+\begin_layout Subsubsection
+
+\change_inserted 0 1284423864
+Proposed Solution
+\begin_inset CommandInset label
+LatexCommand label
+name "replay-attribute"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+
+\change_inserted 0 1284423850
+Tridge points out that an attribute can be later added to tdb_open (see
+ 
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "attributes"
+
+\end_inset
+
+) to provide replay/trace hooks, which could become the basis for this and
+ future parallel transactions and snapshot support.
+\change_unchanged
+
  \end_layout
  
  \end_body