-head 1.9;
+head 1.13;
access;
symbols;
locks; strict;
comment @# @;
+1.13
+date 2011.03.01.11.46.54; author rusty; state Exp;
+branches;
+next 1.12;
+
+1.12
+date 2010.12.01.12.20.49; author rusty; state Exp;
+branches;
+next 1.11;
+
+1.11
+date 2010.12.01.11.55.20; author rusty; state Exp;
+branches;
+next 1.10;
+
+1.10
+date 2010.09.14.00.33.57; author rusty; state Exp;
+branches;
+next 1.9;
+
1.9
date 2010.09.09.07.25.12; author rusty; state Exp;
branches;
@
-1.9
+1.13
log
-@Extension mechanism.
+@Thread-safe API
@
text
-@#LyX 1.6.5 created this file. For more info see http://www.lyx.org/
+@#LyX 1.6.7 created this file. For more info see http://www.lyx.org/
\lyxformat 345
\begin_document
\begin_header
\end_layout
\begin_layout Date
-
-\change_deleted 0 1283307542
-26-July
-\change_inserted 0 1284016854
-9-September
-\change_unchanged
--2010
+1-December-2010
\end_layout
\begin_layout Abstract
\begin_layout Subsubsection
Proposed Solution
+\begin_inset CommandInset label
+LatexCommand label
+name "attributes"
+
+\end_inset
+
+
\end_layout
\begin_layout Standard
of the union.
\end_layout
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
\begin_layout Subsection
tdb_traverse Makes Impossible Guarantees
\end_layout
You can prevent changes by using a transaction or the locking API.
\end_layout
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+ Delete-during-traverse will still delete every record, too (assuming no
+ other changes).
+\end_layout
+
\begin_layout Subsection
Nesting of Transactions Is Fraught
\end_layout
-obscure case.
\end_layout
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1298979572
+Incomplete; nesting flag is still defined as per tdb1.
+\change_inserted 0 1298979584
+Complete; the nesting flag has been removed.
+\change_unchanged
+
+\end_layout
+
\begin_layout Subsection
Incorrect Hash Function is Not Detected
\end_layout
hash function produces the same answer, or fail the tdb_open call.
\end_layout
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
\begin_layout Subsection
tdb_set_max_dead/TDB_VOLATILE Expose Implementation
\end_layout
tuning, but initially will become a no-op.
\end_layout
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Incomplete.
+ TDB_VOLATILE still defined, but implementation should fail on unknown flags
+ to be future-proof.
+\end_layout
+
\begin_layout Subsection
\begin_inset CommandInset label
LatexCommand label
an API.
\end_layout
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Incomplete.
+\end_layout
+
\begin_layout Subsection
TDB API Is Not POSIX Thread-safe
\end_layout
\begin_layout Standard
Reachitecting the API to include a tdb_errcode pointer would be a great
- deal of churn; we are better to guarantee that the tdb_errcode is per-thread
- so the current programming model can be maintained.
+ deal of churn
+\change_inserted 0 1298979557
+, but fortunately most functions return 0 on success and -1 on error: we
+ can change these to return 0 on success and a negative error code on error,
+ and the API remains similar to previous.
+ The tdb_fetch, tdb_firstkey and tdb_nextkey functions need to take a TDB_DATA
+ pointer and return an error code.
+ It is also simpler to have tdb_nextkey replace its key argument in place,
+ freeing up any old .dptr.
\end_layout
\begin_layout Standard
+
+\change_deleted 0 1298979438
+; we are better to guarantee that the tdb_errcode is per-thread so the current
+ programming model can be maintained.
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1298979438
This requires dynamic per-thread allocations, which is awkward with POSIX
threads (pthread_key_create space is limited and we cannot simply allocate
a key for every TDB).
+\change_unchanged
+
\end_layout
\begin_layout Standard
\begin_layout Standard
The aim is that building tdb with -DTDB_PTHREAD will result in a pthread-safe
version of the library, and otherwise no overhead will exist.
-
-\change_inserted 0 1284016998
Alternatively, a hooking mechanism similar to that proposed for
\begin_inset CommandInset ref
LatexCommand ref
\end_inset
could be used to enable pthread locking at runtime.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Incomplete
+\change_inserted 0 1298979681
+; API has been changed but thread safety has not been implemented.
+\change_deleted 0 1298979669
+.
\change_unchanged
\end_layout
It also keeps the complexity out of the API, and in ctdbd where it is needed.
\end_layout
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Incomplete.
+\end_layout
+
\begin_layout Subsection
tdb_chainlock Functions Expose Implementation
\end_layout
\end_layout
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Incomplete.
+\end_layout
+
\begin_layout Subsection
The API Uses Gratuitous Typedefs, Capitals
\end_layout
the API/ABI.
\end_layout
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
\begin_layout Subsection
Various Callback Functions Are Not Typesafe
\end_layout
See CCAN's typesafe_cb module at http://ccan.ozlabs.org/info/typesafe_cb.html
\end_layout
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Incomplete.
+\end_layout
+
\begin_layout Subsection
TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, tdb_reopen_all Problematic
\end_layout
\end_inset
.
-\change_inserted 0 1284015637
+\end_layout
+\begin_layout Subsubsection
+Status
\end_layout
-\begin_layout Subsection
+\begin_layout Standard
-\change_inserted 0 1284015716
+\change_deleted 0 1298979699
+Incomplete, TDB_CLEAR_IF_FIRST still defined, but not implemented.
+\change_inserted 0 1298979700
+Complete.
+\change_unchanged
+
+\end_layout
+
+\begin_layout Subsection
Extending The Header Is Difficult
\end_layout
\begin_layout Standard
-
-\change_inserted 0 1284015906
We have reserved (zeroed) words in the TDB header, which can be used for
future features.
If the future features are compulsory, the version number must be updated
\end_layout
\begin_layout Subsubsection
-
-\change_inserted 0 1284015637
Proposed Solution
\end_layout
\begin_layout Standard
-
-\change_inserted 0 1284016114
The header should contain a
\begin_inset Quotes eld
\end_inset
\end_layout
\begin_layout Enumerate
-
-\change_inserted 0 1284016149
The lower part reflects the format variant understood by code accessing
the database.
\end_layout
\begin_layout Enumerate
-
-\change_inserted 0 1284016639
The upper part reflects the format variant you must understand to write
to the database (otherwise you can only open for reading).
\end_layout
\begin_layout Standard
-
-\change_inserted 0 1284016821
The latter field can only be written at creation time, the former should
be written under the OPEN_LOCK when opening the database for writing, if
the variant of the code is lower than the current lowest variant.
\end_layout
\begin_layout Standard
-
-\change_inserted 0 1284016803
This should allow backwards-compatible features to be added, and detection
if older code (which doesn't understand the feature) writes to the database.
-\change_deleted 0 1284016101
+\end_layout
+\begin_layout Subsubsection
+Status
\end_layout
-\begin_layout Subsection
+\begin_layout Standard
+Incomplete.
+\end_layout
-\change_inserted 0 1284015634
+\begin_layout Subsection
Record Headers Are Not Expandible
\end_layout
\begin_layout Standard
-
-\change_inserted 0 1284015634
If we later want to add (say) checksums on keys and data, it would require
another format change, which we'd like to avoid.
\end_layout
\begin_layout Subsubsection
-
-\change_inserted 0 1284015634
Proposed Solution
\end_layout
\begin_layout Standard
-
-\change_inserted 0 1284016847
We often have extra padding at the tail of a record.
If we ensure that the first byte (if any) of this padding is zero, we will
have a way for future changes to detect code which doesn't understand a
new format: the new code would write (say) a 1 at the tail, and thus if
there is no tail or the first byte is 0, we would know the extension is
not present on that record.
-\change_unchanged
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Incomplete.
+\end_layout
+
+\begin_layout Subsection
+TDB Does Not Use Talloc
+\end_layout
+
+\begin_layout Standard
+Many users of TDB (particularly Samba) use the talloc allocator, and thus
+ have to wrap TDB in a talloc context to use it conveniently.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\end_layout
+
+\begin_layout Standard
+The allocation within TDB is not complicated enough to justify the use of
+ talloc, and I am reluctant to force another (excellent) library on TDB
+ users.
+ Nonetheless a compromise is possible.
+ An attribute (see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "attributes"
+
+\end_inset
+
+) can be added later to tdb_open() to provide an alternate allocation mechanism,
+ specifically for talloc but usable by any other allocator (which would
+ ignore the
+\begin_inset Quotes eld
+\end_inset
+
+context
+\begin_inset Quotes erd
+\end_inset
+
+ argument).
+\end_layout
+
+\begin_layout Standard
+This would form a talloc heirarchy as expected, but the caller would still
+ have to attach a destructor to the tdb context returned from tdb_open to
+ close it.
+ All TDB_DATA fields would be children of the tdb_context, and the caller
+ would still have to manage them (using talloc_free() or talloc_steal()).
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+\begin_layout Standard
+Deferred.
\end_layout
\begin_layout Section
point.
\end_layout
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1298979837
+Incomplete; TDB_CLEAR_IF_FIRST still defined, but does nothing.
+\change_inserted 0 1298979837
+Complete.
+\change_unchanged
+
+\end_layout
+
\begin_layout Subsection
TDB Files Have a 4G Limit
\end_layout
be erased and initialized as a fresh tdb!)
\end_layout
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
\begin_layout Subsection
TDB Records Have a 4G Limit
\end_layout
).
\end_layout
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
\begin_layout Subsection
Hash Size Is Determined At TDB Creation Time
\end_layout
\end_layout
\begin_layout Subsubsection
-
-\change_inserted 0 1283336713
\begin_inset CommandInset label
LatexCommand label
name "sub:Hash-Size-Solution"
\end_inset
-
-\change_unchanged
Proposed Solution
\end_layout
, it became clear that it is hard to beat a straight linear hash table which
doubles in size when it reaches saturation.
-
-\change_deleted 0 1283307675
-There are three details which become important:
+ Unfortunately, altering the hash table introduces serious locking complications
+: the entire hash table needs to be locked to enlarge the hash table, and
+ others might be holding locks.
+ Particularly insidious are insertions done under tdb_chainlock.
\end_layout
-\begin_layout Enumerate
+\begin_layout Standard
+Thus an expanding layered hash will be used: an array of hash groups, with
+ each hash group exploding into pointers to lower hash groups once it fills,
+ turning into a hash tree.
+ This has implications for locking: we must lock the entire group in case
+ we need to expand it, yet we don't know how deep the tree is at that point.
+\end_layout
-\change_deleted 0 1283307675
-On encountering a full bucket, we use the next bucket.
+\begin_layout Standard
+Note that bits from the hash table entries should be stolen to hold more
+ hash bits to reduce the penalty of collisions.
+ We can use the otherwise-unused lower 3 bits.
+ If we limit the size of the database to 64 exabytes, we can use the top
+ 8 bits of the hash entry as well.
+ These 11 bits would reduce false positives down to 1 in 2000 which is more
+ than we need: we can use one of the bits to indicate that the extra hash
+ bits are valid.
+ This means we can choose not to re-hash all entries when we expand a hash
+ group; simply use the next bits we need and mark them invalid.
\end_layout
-\begin_layout Enumerate
+\begin_layout Subsubsection
+Status
+\end_layout
-\change_deleted 0 1283307675
-Extra hash bits are stored with the offset, to reduce comparisons.
+\begin_layout Standard
+Complete.
\end_layout
-\begin_layout Enumerate
+\begin_layout Subsection
+\begin_inset CommandInset label
+LatexCommand label
+name "TDB-Freelist-Is"
-\change_deleted 0 1283307675
-A marker entry is used on deleting an entry.
+\end_inset
+
+TDB Freelist Is Highly Contended
\end_layout
\begin_layout Standard
-
-\change_deleted 0 1283307675
-The doubling of the table must be done under a transaction; we will not
- reduce it on deletion, so it will be an unusual case.
- It will either be placed at the head (other entries will be moved out the
- way so we can expand).
- We could have a pointer in the header to the current hashtable location,
- but that pointer would have to be read frequently to check for hashtable
- moves.
-\end_layout
-
-\begin_layout Standard
-
-\change_deleted 0 1283307675
-The locking for this is slightly more complex than the chained case; we
- currently have one lock per bucket, and that means we would need to expand
- the lock if we overflow to the next bucket.
- The frequency of such collisions will effect our locking heuristics: we
- can always lock more buckets than we need.
-\end_layout
-
-\begin_layout Standard
-
-\change_deleted 0 1283307675
-One possible optimization is to only re-check the hash size on an insert
- or a lookup miss.
-
-\change_inserted 0 1283307770
- Unfortunately, altering the hash table introduces serious locking complications
-: the entire hash table needs to be locked to enlarge the hash table, and
- others might be holding locks.
- Particularly insidious are insertions done under tdb_chainlock.
-\end_layout
-
-\begin_layout Standard
-
-\change_inserted 0 1283336187
-Thus an expanding layered hash will be used: an array of hash groups, with
- each hash group exploding into pointers to lower hash groups once it fills,
- turning into a hash tree.
- This has implications for locking: we must lock the entire group in case
- we need to expand it, yet we don't know how deep the tree is at that point.
-\end_layout
-
-\begin_layout Standard
-
-\change_inserted 0 1283336586
-Note that bits from the hash table entries should be stolen to hold more
- hash bits to reduce the penalty of collisions.
- We can use the otherwise-unused lower 3 bits.
- If we limit the size of the database to 64 exabytes, we can use the top
- 8 bits of the hash entry as well.
- These 11 bits would reduce false positives down to 1 in 2000 which is more
- than we need: we can use one of the bits to indicate that the extra hash
- bits are valid.
- This means we can choose not to re-hash all entries when we expand a hash
- group; simply use the next bits we need and mark them invalid.
-\change_unchanged
-
-\end_layout
-
-\begin_layout Subsection
-\begin_inset CommandInset label
-LatexCommand label
-name "TDB-Freelist-Is"
-
-\end_inset
-
-TDB Freelist Is Highly Contended
-\end_layout
-
-\begin_layout Standard
-TDB uses a single linked list for the free list.
- Allocation occurs as follows, using heuristics which have evolved over
- time:
-\end_layout
+TDB uses a single linked list for the free list.
+ Allocation occurs as follows, using heuristics which have evolved over
+ time:
+\end_layout
\begin_layout Enumerate
Get the free list lock for this whole operation.
\begin_layout Subsubsection
Proposed Solution
-\change_deleted 0 1283336858
-
\end_layout
\begin_layout Standard
This implies that the number of free lists is related to the size of the
hash table, but as it is rare to walk a large number of free list entries
we can use far fewer, say 1/32 of the number of hash buckets.
-\change_inserted 0 1283336910
-
\end_layout
\begin_layout Standard
-
-\change_inserted 0 1283337052
It seems tempting to try to reuse the hash implementation which we use for
records here, but we have two ways of searching for free entries: for allocatio
n we search by size (and possibly zone) which produces too many clashes
for our hash table to handle well, and for coalescing we search by address.
Thus an array of doubly-linked free lists seems preferable.
-\change_unchanged
-
\end_layout
\begin_layout Standard
) but it's not clear this would reduce contention in the common case where
all processes are allocating/freeing the same size.
Thus we almost certainly need to divide in other ways: the most obvious
- is to divide the file into zones, and using a free list (or set of free
+ is to divide the file into zones, and using a free list (or table of free
lists) for each.
This approximates address ordering.
\end_layout
\begin_layout Standard
-Note that this means we need to split the free lists when we expand the
- file; this is probably acceptable when we double the hash table size, since
- that is such an expensive operation already.
- In the case of increasing the file size, there is an optimization we can
- use: if we use M in the formula above as the file size rounded up to the
- next power of 2, we only need reshuffle free lists when the file size crosses
- a power of 2 boundary,
-\emph on
-and
-\emph default
-reshuffling the free lists is trivial: we simply merge every consecutive
- pair of free lists.
+Unfortunately it is difficult to know what heuristics should be used to
+ determine zone sizes, and our transaction code relies on being able to
+ create a
+\begin_inset Quotes eld
+\end_inset
+
+recovery area
+\begin_inset Quotes erd
+\end_inset
+
+ by simply appending to the file (difficult if it would need to create a
+ new zone header).
+ Thus we use a linked-list of free tables; currently we only ever create
+ one, but if there is more than one we choose one at random to use.
+ In future we may use heuristics to add new free tables on contention.
+ We only expand the file when all free tables are exhausted.
\end_layout
\begin_layout Standard
\end_layout
\begin_layout Enumerate
-Identify the correct zone.
+Identify the correct free list.
\end_layout
\begin_layout Enumerate
\end_layout
\begin_layout Enumerate
-Re-check the zone (we didn't have a lock, sizes could have changed): relock
+Re-check the list (we didn't have a lock, sizes could have changed): relock
if necessary.
\end_layout
\begin_layout Enumerate
-Place the freed entry in the list for that zone.
+Place the freed entry in the list.
\end_layout
\begin_layout Standard
\end_layout
\begin_layout Enumerate
-Pick a zone either the zone we last freed into, or based on a
-\begin_inset Quotes eld
-\end_inset
-
-random
-\begin_inset Quotes erd
-\end_inset
-
- number.
+Pick a free table; usually the previous one.
\end_layout
\begin_layout Enumerate
\end_layout
\begin_layout Enumerate
-Re-check the zone: relock if necessary.
+If the top entry is -large enough, remove it from the list and return it.
\end_layout
\begin_layout Enumerate
-If the top entry is -large enough, remove it from the list and return it.
+Otherwise, coalesce entries in the list.If there was no entry large enough,
+ unlock the list and try the next largest list
\end_layout
\begin_layout Enumerate
-Otherwise, coalesce entries in the list.If there was no entry large enough,
- unlock the list and try the next zone.
+If no list has an entry which meets our needs, try the next free table.
\end_layout
\begin_layout Enumerate
\end_layout
\begin_layout Standard
-I anticipate that the number of entries in each free zone would be small,
- but it might be worth using one free entry to hold pointers to the others
- for cache efficiency.
-\change_inserted 0 1283309850
-
-\end_layout
-
-\begin_layout Standard
-
-\change_inserted 0 1283337216
-\begin_inset CommandInset label
-LatexCommand label
-name "freelist-in-zone"
-
-\end_inset
-
-If we want to avoid locking complexity (enlarging the free lists when we
- enlarge the file) we could place the array of free lists at the beginning
- of each zone.
- This means existing array lists never move, but means that a record cannot
- be larger than a zone.
- That in turn implies that zones should be variable sized (say, power of
- 2), which makes the question
-\begin_inset Quotes eld
-\end_inset
-
-what zone is this record in?
-\begin_inset Quotes erd
-\end_inset
-
- much harder (and
-\begin_inset Quotes eld
-\end_inset
-
-pick a random zone
-\begin_inset Quotes erd
-\end_inset
-
-, but that's less common).
- It could be done with as few as 4 bits from the record header.
-\begin_inset Foot
-status open
-
-\begin_layout Plain Layout
-
-\change_inserted 0 1283310945
-Using
-\begin_inset Formula $2^{16+N*3}$
-\end_inset
-
-means 0 gives a minimal 65536-byte zone, 15 gives the maximal
-\begin_inset Formula $2^{61}$
-\end_inset
-
- byte zone.
- Zones range in factor of 8 steps.
-\change_unchanged
-
-\end_layout
-
-\end_inset
-
-
-\change_unchanged
-
+Each free entry has the free table number in the header: less than 255.
+ It also contains a doubly-linked list for easy deletion.
\end_layout
\begin_layout Subsection
it reduces 99.9% of false memcmp).
As an aside, as the lower bits are already incorporated in the hash table
resolution, the upper bits should be used here.
-
-\change_inserted 0 1283336739
Note that it's not clear that these bits will be a win, given the extra
bits in the hash table itself (see
\begin_inset CommandInset ref
\end_inset
).
-\change_unchanged
-
\end_layout
\begin_layout Enumerate
\end_layout
\begin_layout LyX-Code
- uint32_t magic : 16,
+ uint32_t used_magic : 16,
\end_layout
\begin_layout LyX-Code
- prev_is_free: 1,
+
\end_layout
\begin_layout LyX-Code
\end_layout
\begin_layout LyX-Code
- top_hash: 10;
+ top_hash: 11;
\end_layout
\begin_layout LyX-Code
\end_layout
\begin_layout LyX-Code
- uint32_t free_magic;
+ uint64_t free_magic: 8,
\end_layout
\begin_layout LyX-Code
- uint64_t total_length;
-\change_inserted 0 1283337133
-
+ prev : 56;
\end_layout
\begin_layout LyX-Code
-\change_inserted 0 1283337139
- uint64_t prev, next;
-\change_unchanged
+\end_layout
+\begin_layout LyX-Code
+ uint64_t free_table: 8,
\end_layout
\begin_layout LyX-Code
- ...
+ total_length : 56
\end_layout
\begin_layout LyX-Code
- uint64_t tailer;
+ uint64_t next;;
\end_layout
\begin_layout LyX-Code
\begin_layout Standard
-\change_inserted 0 1283337235
-We might want to take some bits from the used record's top_hash (and the
- free record which has 32 bits of padding to spare anyway) if we use variable
- sized zones.
- See
-\begin_inset CommandInset ref
-LatexCommand ref
-reference "freelist-in-zone"
-
-\end_inset
-
-.
+\change_deleted 0 1291206079
+
\change_unchanged
+Note that by limiting valid offsets to 56 bits, we can pack everything we
+ need into 3 64-byte words, meaning our minimum record size is 8 bytes.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+\begin_layout Standard
+Complete.
\end_layout
\begin_layout Subsection
a transaction in progress; we need only check for recovery if this is set.
\end_layout
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Deferred.
+\end_layout
+
\begin_layout Subsection
\begin_inset CommandInset label
LatexCommand label
\end_layout
\begin_layout Subsubsection
-Proposed Solution
-\end_layout
-
-\begin_layout Standard
-None.
+Proposed SolutionNone.
At some point you say
\begin_inset Quotes eld
\end_inset
\begin_inset Quotes erd
\end_inset
-.
+ (but see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "replay-attribute"
+
+\end_inset
+
+).
\end_layout
\begin_layout Standard
different hash tables/free tables.
\end_layout
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Deferred.
+\end_layout
+
\begin_layout Subsection
Transactions Cannot Operate in Parallel
\end_layout
\end_layout
\begin_layout Standard
-We could solve a small part of the problem by providing read-only transactions.
+None (but see
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "replay-attribute"
+
+\end_inset
+
+).
+ We could solve a small part of the problem by providing read-only transactions.
These would allow one write transaction to begin, but it could not commit
until all r/o transactions are done.
This would require a new RO_TRANSACTION_LOCK, which would be upgraded on
commit.
\end_layout
-\begin_layout Subsection
-Default Hash Function Is Suboptimal
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Deferred.
+\end_layout
+
+\begin_layout Subsection
+Default Hash Function Is Suboptimal
\end_layout
\begin_layout Standard
hash bombing.
\end_layout
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
\begin_layout Subsection
\begin_inset CommandInset label
LatexCommand label
.
\end_layout
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Complete.
+\end_layout
+
\begin_layout Subsection
Fcntl Locking Adds Overhead
\end_layout
transactions until it encountered an invalid checksum.
\end_layout
+\begin_layout Subsection
+Tracing Is Fragile, Replay Is External
+\end_layout
+
+\begin_layout Standard
+The current TDB has compile-time-enabled tracing code, but it often breaks
+ as it is not enabled by default.
+ In a similar way, the ctdb code has an external wrapper which does replay
+ tracing so it can coordinate cluster-wide transactions.
+\end_layout
+
+\begin_layout Subsubsection
+Proposed Solution
+\begin_inset CommandInset label
+LatexCommand label
+name "replay-attribute"
+
+\end_inset
+
+
+\end_layout
+
+\begin_layout Standard
+Tridge points out that an attribute can be later added to tdb_open (see
+
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "attributes"
+
+\end_inset
+
+) to provide replay/trace hooks, which could become the basis for this and
+ future parallel transactions and snapshot support.
+\end_layout
+
+\begin_layout Subsubsection
+Status
+\end_layout
+
+\begin_layout Standard
+Deferred.
+\end_layout
+
\end_body
\end_document
@
+1.12
+log
+@Add status, some fixes, linked freelists.
+@
+text
+@d53 1
+a53 7
+
+\change_deleted 0 1291204535
+14-September
+\change_inserted 0 1291204533
+1-December
+\change_unchanged
+-2010
+a580 2
+\change_inserted 0 1291204563
+
+a583 2
+
+\change_inserted 0 1291204572
+a587 2
+
+\change_inserted 0 1291204573
+a588 2
+\change_unchanged
+
+a629 2
+\change_inserted 0 1291204588
+
+a632 2
+
+\change_inserted 0 1291204588
+a636 2
+
+\change_inserted 0 1291204631
+a639 2
+\change_unchanged
+
+a693 2
+\change_inserted 0 1291204639
+
+a696 2
+
+\change_inserted 0 1291204640
+d702 1
+a702 1
+\change_inserted 0 1291204665
+d704 2
+a728 2
+\change_inserted 0 1291204671
+
+a731 2
+
+\change_inserted 0 1291204671
+a735 2
+
+\change_inserted 0 1291204673
+a736 2
+\change_unchanged
+
+a780 2
+\change_inserted 0 1291204731
+
+a783 2
+
+\change_inserted 0 1291204732
+a787 2
+
+\change_inserted 0 1291204779
+a790 2
+\change_unchanged
+
+a842 2
+\change_inserted 0 1291204830
+
+a845 2
+
+\change_inserted 0 1291204831
+a849 2
+
+\change_inserted 0 1291204834
+a850 2
+\change_unchanged
+
+d879 9
+a887 2
+ deal of churn; we are better to guarantee that the tdb_errcode is per-thread
+ so the current programming model can be maintained.
+d891 9
+d903 2
+a922 2
+\change_inserted 0 1291204847
+
+a925 2
+
+\change_inserted 0 1291204847
+d930 5
+a934 3
+
+\change_inserted 0 1291204852
+Incomplete.
+a1051 2
+\change_inserted 0 1291204881
+
+a1054 2
+
+\change_inserted 0 1291204881
+a1058 2
+
+\change_inserted 0 1291204885
+a1059 2
+\change_unchanged
+
+a1140 2
+\change_inserted 0 1291204898
+
+a1143 2
+
+\change_inserted 0 1291204898
+a1147 2
+
+\change_inserted 0 1291204901
+a1148 2
+\change_unchanged
+
+a1224 2
+\change_inserted 0 1291204908
+
+a1227 2
+
+\change_inserted 0 1291204908
+a1231 2
+
+\change_inserted 0 1291204908
+a1232 2
+\change_unchanged
+
+a1271 2
+\change_inserted 0 1291204917
+
+a1274 2
+
+\change_inserted 0 1291204917
+a1278 2
+
+\change_inserted 0 1291204920
+a1279 2
+\change_unchanged
+
+a1316 2
+\change_inserted 0 1291204927
+
+a1319 2
+
+\change_inserted 0 1291204928
+d1325 1
+a1325 1
+\change_inserted 0 1291204942
+d1327 2
+a1381 2
+\change_inserted 0 1291205003
+
+a1384 2
+
+\change_inserted 0 1291205004
+a1388 2
+
+\change_inserted 0 1291205007
+a1411 2
+\change_inserted 0 1291205019
+
+a1414 2
+
+\change_inserted 0 1291205019
+a1418 2
+
+\change_inserted 0 1291205023
+a1419 2
+\change_unchanged
+
+a1465 2
+\change_inserted 0 1291205029
+
+a1468 2
+
+\change_inserted 0 1291205029
+a1472 2
+
+\change_inserted 0 1291206020
+a1473 2
+\change_unchanged
+
+a1528 2
+\change_inserted 0 1291205043
+
+a1531 2
+
+\change_inserted 0 1291205043
+d1537 1
+a1537 1
+\change_inserted 0 1291205057
+d1539 2
+a1589 2
+\change_inserted 0 1291205062
+
+a1592 2
+
+\change_inserted 0 1291205062
+a1596 2
+
+\change_inserted 0 1291205062
+a1597 2
+\change_unchanged
+
+a1626 2
+\change_inserted 0 1291205072
+
+a1629 2
+
+\change_inserted 0 1291205073
+a1633 2
+
+\change_inserted 0 1291205073
+a1634 2
+\change_unchanged
+
+a1674 4
+
+\change_deleted 0 1291204504
+
+\change_unchanged
+a1699 2
+\change_inserted 0 1291205079
+
+a1702 2
+
+\change_inserted 0 1291205080
+a1706 2
+
+\change_inserted 0 1291205080
+a1707 2
+\change_unchanged
+
+a1833 2
+\change_inserted 0 1291205090
+
+d1869 2
+a1870 7
+ is to divide the file into zones, and using a free list (or
+\change_inserted 0 1291205498
+table
+\change_deleted 0 1291205497
+set
+\change_unchanged
+ of free lists) for each.
+a1871 2
+\change_inserted 0 1291205203
+
+a1874 2
+
+\change_inserted 0 1291205358
+a1890 21
+\change_unchanged
+
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1291205198
+Note that this means we need to split the free lists when we expand the
+ file; this is probably acceptable when we double the hash table size, since
+ that is such an expensive operation already.
+ In the case of increasing the file size, there is an optimization we can
+ use: if we use M in the formula above as the file size rounded up to the
+ next power of 2, we only need reshuffle free lists when the file size crosses
+ a power of 2 boundary,
+\emph on
+and
+\emph default
+reshuffling the free lists is trivial: we simply merge every consecutive
+ pair of free lists.
+\change_unchanged
+
+d1899 1
+a1899 7
+Identify the correct
+\change_inserted 0 1291205366
+free list
+\change_deleted 0 1291205364
+zone
+\change_unchanged
+.
+d1907 2
+a1908 7
+Re-check the
+\change_inserted 0 1291205372
+list
+\change_deleted 0 1291205371
+zone
+\change_unchanged
+ (we didn't have a lock, sizes could have changed): relock if necessary.
+d1912 1
+a1912 5
+Place the freed entry in the list
+\change_deleted 0 1291205382
+ for that zone
+\change_unchanged
+.
+d1921 1
+a1921 15
+Pick a
+\change_deleted 0 1291205403
+zone either the zone we last freed into, or based on a
+\begin_inset Quotes eld
+\end_inset
+
+random
+\begin_inset Quotes erd
+\end_inset
+
+ number.
+\change_inserted 0 1291205411
+free table; usually the previous one.
+\change_unchanged
+
+a1925 10
+\change_deleted 0 1291205432
+
+\end_layout
+
+\begin_layout Enumerate
+
+\change_deleted 0 1291205428
+Re-check the zone: relock if necessary.
+\change_unchanged
+
+d1934 1
+a1934 7
+ unlock the list and try the next
+\change_inserted 0 1291205455
+largest list
+\change_deleted 0 1291205452
+zone.
+\change_inserted 0 1291205457
+
+a1937 2
+
+\change_inserted 0 1291205476
+a1938 2
+\change_unchanged
+
+a1966 2
+\change_inserted 0 1291205542
+
+a1969 2
+
+\change_inserted 0 1291205591
+a1971 70
+\change_unchanged
+
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1291205539
+I anticipate that the number of entries in each free zone would be small,
+ but it might be worth using one free entry to hold pointers to the others
+ for cache efficiency.
+\change_unchanged
+
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1291205534
+\begin_inset CommandInset label
+LatexCommand label
+name "freelist-in-zone"
+
+\end_inset
+
+If we want to avoid locking complexity (enlarging the free lists when we
+ enlarge the file) we could place the array of free lists at the beginning
+ of each zone.
+ This means existing array lists never move, but means that a record cannot
+ be larger than a zone.
+ That in turn implies that zones should be variable sized (say, power of
+ 2), which makes the question
+\begin_inset Quotes eld
+\end_inset
+
+what zone is this record in?
+\begin_inset Quotes erd
+\end_inset
+
+ much harder (and
+\begin_inset Quotes eld
+\end_inset
+
+pick a random zone
+\begin_inset Quotes erd
+\end_inset
+
+, but that's less common).
+ It could be done with as few as 4 bits from the record header.
+\begin_inset Foot
+status collapsed
+
+\begin_layout Plain Layout
+Using
+\begin_inset Formula $2^{16+N*3}$
+\end_inset
+
+means 0 gives a minimal 65536-byte zone, 15 gives the maximal
+\begin_inset Formula $2^{61}$
+\end_inset
+
+ byte zone.
+ Zones range in factor of 8 steps.
+ Given the zone size for the zone the current record is in, we can determine
+ the start of the zone.
+\end_layout
+
+\end_inset
+
+
+\change_inserted 0 1291205139
+
+d2218 1
+a2218 5
+ uint32_t
+\change_inserted 0 1291205758
+used_
+\change_unchanged
+magic : 16,
+a2222 4
+\change_deleted 0 1291205693
+ prev_is_free: 1,
+\change_unchanged
+
+d2230 1
+a2230 7
+ top_hash: 1
+\change_inserted 0 1291205704
+1
+\change_deleted 0 1291205704
+0
+\change_unchanged
+;
+d2254 1
+a2254 9
+ uint
+\change_inserted 0 1291205725
+64
+\change_deleted 0 1291205723
+32
+\change_unchanged
+_t
+\change_inserted 0 1291205753
+free_magic: 8,
+a2257 2
+
+\change_inserted 0 1291205746
+a2262 24
+\change_deleted 0 1291205749
+free_magic;
+\change_unchanged
+
+\end_layout
+
+\begin_layout LyX-Code
+ uint64_t
+\change_inserted 0 1291205786
+free_table: 8,
+\end_layout
+
+\begin_layout LyX-Code
+
+\change_inserted 0 1291205788
+
+\change_unchanged
+total_length
+\change_inserted 0 1291205792
+ : 56
+\change_deleted 0 1291205790
+;
+\change_unchanged
+
+d2266 1
+a2266 7
+ uint64_t
+\change_deleted 0 1291205801
+prev,
+\change_unchanged
+next;
+\change_deleted 0 1291205811
+
+d2270 1
+a2270 3
+
+\change_deleted 0 1291205811
+ ...
+d2274 1
+a2274 5
+
+\change_deleted 0 1291205808
+ uint64_t tailer
+\change_unchanged
+;
+d2283 5
+a2287 16
+\change_deleted 0 1291205827
+We might want to take some bits from the used record's top_hash (and the
+ free record which has 32 bits of padding to spare anyway) if we use variable
+ sized zones.
+ See
+\begin_inset CommandInset ref
+LatexCommand ref
+reference "freelist-in-zone"
+
+\end_inset
+
+.
+
+\change_inserted 0 1291205885
+ Note that by limiting valid offsets to 56 bits, we can pack everything
+ we need into 3 64-byte words, meaning our minimum record size is 8 bytes.
+a2290 2
+
+\change_inserted 0 1291205886
+a2294 2
+
+\change_inserted 0 1291205886
+a2295 2
+\change_unchanged
+
+a2385 2
+\change_inserted 0 1291205894
+
+a2388 2
+
+\change_inserted 0 1291205894
+a2392 2
+
+\change_inserted 0 1291205902
+a2393 2
+\change_unchanged
+
+a2415 4
+
+\change_deleted 0 1291204504
+
+\change_unchanged
+a2445 2
+\change_inserted 0 1291205910
+
+a2448 2
+
+\change_inserted 0 1291205910
+a2452 2
+
+\change_inserted 0 1291205914
+a2453 2
+\change_unchanged
+
+a2485 2
+\change_inserted 0 1291205919
+
+a2488 2
+
+\change_inserted 0 1291205919
+a2492 2
+
+\change_inserted 0 1291205922
+a2493 2
+\change_unchanged
+
+a2533 2
+\change_inserted 0 1291205929
+
+a2536 2
+
+\change_inserted 0 1291205929
+a2540 2
+
+\change_inserted 0 1291205929
+a2541 2
+\change_unchanged
+
+a2578 2
+\change_inserted 0 1291205932
+
+a2581 2
+
+\change_inserted 0 1291205933
+a2585 2
+
+\change_inserted 0 1291205933
+a2586 2
+\change_unchanged
+
+a2724 2
+\change_inserted 0 1291205944
+
+a2727 2
+
+\change_inserted 0 1291205945
+a2731 2
+
+\change_inserted 0 1291205948
+a2732 2
+\change_unchanged
+
+@
+
+
+1.11
+log
+@Merge changes
+@
+text
+@d53 7
+a59 1
+14-September-2010
+d587 16
+d644 18
+d716 16
+d753 16
+d813 18
+d883 16
+d953 16
+d1084 16
+d1181 16
+d1273 16
+d1328 16
+d1381 16
+d1447 19
+a1465 2
+ if older code (which doesn't understand the feature) writes to the database.Reco
+rd Headers Are Not Expandible
+d1484 16
+d1546 16
+d1617 16
+d1680 16
+d1725 16
+d1810 16
+d1951 8
+a1958 3
+Proposed SolutionThe first step is to remove all the current heuristics,
+ as they obviously interact, then examine them once the lock contention
+ is addressed.
+d1989 7
+a1995 2
+ is to divide the file into zones, and using a free list (or set of free
+ lists) for each.
+d1997 2
+d2002 25
+d2039 2
+d2049 7
+a2055 1
+Identify the correct zone.
+d2063 7
+a2069 2
+Re-check the zone (we didn't have a lock, sizes could have changed): relock
+ if necessary.
+d2073 5
+a2077 1
+Place the freed entry in the list for that zone.
+d2086 3
+a2088 1
+Pick a zone either the zone we last freed into, or based on a
+d2097 4
+d2105 2
+d2110 2
+d2113 2
+d2123 15
+a2137 1
+ unlock the list and try the next zone.
+d2166 11
+d2180 2
+d2185 2
+d2190 2
+d2223 1
+a2223 1
+status open
+d2243 2
+d2491 5
+a2495 1
+ uint32_t magic : 16,
+d2499 2
+d2502 2
+d2511 7
+a2517 1
+ top_hash: 10;
+d2541 29
+a2569 1
+ uint32_t free_magic;
+d2573 11
+a2583 1
+ uint64_t total_length;
+d2587 7
+a2593 1
+ uint64_t prev, next;
+d2597 2
+d2603 5
+a2607 1
+ uint64_t tailer;
+d2615 2
+d2628 18
+d2736 16
+d2808 16
+d2856 16
+d2912 16
+d2965 16
+d3119 16
+@
+
+
+1.10
+log
+@Tracing attribute, talloc support.
+@
+text
+@d1 1
+a1 1
+#LyX 1.6.5 created this file. For more info see http://www.lyx.org/
+d53 1
+a53 7
+
+\change_deleted 0 1283307542
+26-July
+\change_inserted 0 1284423485
+14-September
+\change_unchanged
+-2010
+a472 2
+\change_inserted 0 1284422789
+
+a479 2
+\change_unchanged
+
+a838 2
+
+\change_inserted 0 1284016998
+a846 2
+\change_unchanged
+
+a1194 2
+\change_inserted 0 1284015637
+
+a1197 2
+
+\change_inserted 0 1284015716
+a1201 2
+
+\change_inserted 0 1284015906
+a1210 2
+
+\change_inserted 0 1284015637
+a1214 2
+
+\change_inserted 0 1284016114
+a1227 2
+
+\change_inserted 0 1284016149
+a1232 2
+
+\change_inserted 0 1284016639
+a1237 2
+
+\change_inserted 0 1284016821
+a1243 2
+
+\change_inserted 0 1284016803
+d1245 2
+a1246 9
+ if older code (which doesn't understand the feature) writes to the database.
+\change_deleted 0 1284016101
+
+\end_layout
+
+\begin_layout Subsection
+
+\change_inserted 0 1284015634
+Record Headers Are Not Expandible
+a1249 2
+
+\change_inserted 0 1284015634
+a1254 2
+
+\change_inserted 0 1284015634
+a1258 2
+
+\change_inserted 0 1284422552
+a1267 2
+
+\change_inserted 0 1284422568
+a1271 2
+
+\change_inserted 0 1284422646
+a1276 2
+
+\change_inserted 0 1284422656
+a1280 2
+
+\change_inserted 0 1284423065
+a1305 2
+
+\change_inserted 0 1284423042
+a1310 2
+\change_unchanged
+
+a1457 2
+
+\change_inserted 0 1283336713
+a1463 2
+
+\change_unchanged
+d1482 2
+d1485 1
+a1485 51
+\change_deleted 0 1283307675
+There are three details which become important:
+\end_layout
+
+\begin_layout Enumerate
+
+\change_deleted 0 1283307675
+On encountering a full bucket, we use the next bucket.
+\end_layout
+
+\begin_layout Enumerate
+
+\change_deleted 0 1283307675
+Extra hash bits are stored with the offset, to reduce comparisons.
+\end_layout
+
+\begin_layout Enumerate
+
+\change_deleted 0 1283307675
+A marker entry is used on deleting an entry.
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1283307675
+The doubling of the table must be done under a transaction; we will not
+ reduce it on deletion, so it will be an unusual case.
+ It will either be placed at the head (other entries will be moved out the
+ way so we can expand).
+ We could have a pointer in the header to the current hashtable location,
+ but that pointer would have to be read frequently to check for hashtable
+ moves.
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1283307675
+The locking for this is slightly more complex than the chained case; we
+ currently have one lock per bucket, and that means we would need to expand
+ the lock if we overflow to the next bucket.
+ The frequency of such collisions will effect our locking heuristics: we
+ can always lock more buckets than we need.
+\end_layout
+
+\begin_layout Standard
+
+\change_deleted 0 1283307675
+One possible optimization is to only re-check the hash size on an insert
+ or a lookup miss.
+
+\change_inserted 0 1283307770
+a1492 2
+
+\change_inserted 0 1283336187
+a1500 2
+
+\change_inserted 0 1283336586
+a1510 2
+\change_unchanged
+
+d1636 3
+a1638 8
+Proposed Solution
+\change_deleted 0 1283336858
+
+\end_layout
+
+\begin_layout Standard
+The first step is to remove all the current heuristics, as they obviously
+ interact, then examine them once the lock contention is addressed.
+a1647 2
+\change_inserted 0 1283336910
+
+a1650 2
+
+\change_inserted 0 1283337052
+a1655 2
+\change_unchanged
+
+a1776 2
+\change_inserted 0 1283309850
+
+a1779 2
+
+\change_inserted 0 1283337216
+a1813 2
+
+\change_inserted 0 1284424151
+a1825 2
+\change_unchanged
+
+a1830 2
+\change_unchanged
+
+a2031 2
+
+\change_inserted 0 1283336739
+a2040 2
+\change_unchanged
+
+a2117 2
+\change_inserted 0 1283337133
+
+a2120 2
+
+\change_inserted 0 1283337139
+a2121 2
+\change_unchanged
+
+a2136 2
+
+\change_inserted 0 1283337235
+a2147 2
+\change_unchanged
+
+d2251 1
+a2251 7
+Proposed Solution
+\change_deleted 0 1284423472
+
+\end_layout
+
+\begin_layout Standard
+None.
+d2261 1
+a2261 1
+\change_inserted 0 1284423891
+d2263 1
+a2263 4
+\change_deleted 0 1284423891
+.
+
+\change_inserted 0 1284423901
+a2271 2
+\change_unchanged
+
+a2293 2
+\change_inserted 0 1284423495
+
+a2312 2
+
+\change_inserted 0 1284424201
+d2321 1
+a2321 3
+
+\change_unchanged
+We could solve a small part of the problem by providing read-only transactions.
+a2505 2
+\change_inserted 0 1284423555
+
+a2508 2
+
+\change_inserted 0 1284423617
+a2512 2
+
+\change_inserted 0 1284423719
+a2519 2
+
+\change_inserted 0 1284423864
+a2530 2
+
+\change_inserted 0 1284423850
+a2540 2
+\change_unchanged
+
+@
+
+
+1.9
+log
+@Extension mechanism.
+@
+text
+@d56 2
+a57 2
+\change_inserted 0 1284016854
+9-September
+d479 11
+d1303 1
+a1303 1
+\change_inserted 0 1284016847
+d1310 56
+d1945 1
+a1945 1
+\change_inserted 0 1283310945
+d1956 2
+d2402 2
+d2416 4
+d2421 12
+d2455 2
+d2476 12
+d2673 47
+@
+
+
1.8
log
@Remove bogus footnote