git.ozlabs.org Git - ccan/blob - ccan/tdb2/doc/design.lyx,v

   1 head    1.9;
   2 access;
   3 symbols;
   4 locks; strict;
   5 comment @# @;
   6
   7
   8 1.9
   9 date    2010.09.09.07.25.12;    author rusty;   state Exp;
  10 branches;
  11 next    1.8;
  12
  13 1.8
  14 date    2010.09.02.02.29.05;    author rusty;   state Exp;
  15 branches;
  16 next    1.7;
  17
  18 1.7
  19 date    2010.09.01.10.58.12;    author rusty;   state Exp;
  20 branches;
  21 next    1.6;
  22
  23 1.6
  24 date    2010.08.02.00.21.43;    author rusty;   state Exp;
  25 branches;
  26 next    1.5;
  27
  28 1.5
  29 date    2010.08.02.00.21.16;    author rusty;   state Exp;
  30 branches;
  31 next    1.4;
  32
  33 1.4
  34 date    2010.05.10.13.09.11;    author rusty;   state Exp;
  35 branches;
  36 next    1.3;
  37
  38 1.3
  39 date    2010.05.10.11.58.37;    author rusty;   state Exp;
  40 branches;
  41 next    1.2;
  42
  43 1.2
  44 date    2010.05.10.05.35.13;    author rusty;   state Exp;
  45 branches;
  46 next    1.1;
  47
  48 1.1
  49 date    2010.05.04.02.29.16;    author rusty;   state Exp;
  50 branches;
  51 next    ;
  52
  53
  54 desc
  55 @First draft
  56 @
  57
  58
  59 1.9
  60 log
  61 @Extension mechanism.
  62 @
  63 text
  64 @#LyX 1.6.5 created this file. For more info see http://www.lyx.org/
  65 \lyxformat 345
  66 \begin_document
  67 \begin_header
  68 \textclass article
  69 \use_default_options true
  70 \language english
  71 \inputencoding auto
  72 \font_roman default
  73 \font_sans default
  74 \font_typewriter default
  75 \font_default_family default
  76 \font_sc false
  77 \font_osf false
  78 \font_sf_scale 100
  79 \font_tt_scale 100
  80
  81 \graphics default
  82 \paperfontsize default
  83 \use_hyperref false
  84 \papersize default
  85 \use_geometry false
  86 \use_amsmath 1
  87 \use_esint 1
  88 \cite_engine basic
  89 \use_bibtopic false
  90 \paperorientation portrait
  91 \secnumdepth 3
  92 \tocdepth 3
  93 \paragraph_separation indent
  94 \defskip medskip
  95 \quotes_language english
  96 \papercolumns 1
  97 \papersides 1
  98 \paperpagestyle default
  99 \tracking_changes true
 100 \output_changes true
 101 \author "Rusty Russell,,,"
 102 \author ""
 103 \end_header
 104
 105 \begin_body
 106
 107 \begin_layout Title
 108 TDB2: A Redesigning The Trivial DataBase
 109 \end_layout
 110
 111 \begin_layout Author
 112 Rusty Russell, IBM Corporation
 113 \end_layout
 114
 115 \begin_layout Date
 116
 117 \change_deleted 0 1283307542
 118 26-July
 119 \change_inserted 0 1284016854
 120 9-September
 121 \change_unchanged
 122 -2010
 123 \end_layout
 124
 125 \begin_layout Abstract
 126 The Trivial DataBase on-disk format is 32 bits; with usage cases heading
 127  towards the 4G limit, that must change.
 128  This required breakage provides an opportunity to revisit TDB's other design
 129  decisions and reassess them.
 130 \end_layout
 131
 132 \begin_layout Section
 133 Introduction
 134 \end_layout
 135
 136 \begin_layout Standard
 137 The Trivial DataBase was originally written by Andrew Tridgell as a simple
 138  key/data pair storage system with the same API as dbm, but allowing multiple
 139  readers and writers while being small enough (< 1000 lines of C) to include
 140  in SAMBA.
 141  The simple design created in 1999 has proven surprisingly robust and performant
 142 , used in Samba versions 3 and 4 as well as numerous other projects.
 143  Its useful life was greatly increased by the (backwards-compatible!) addition
 144  of transaction support in 2005.
 145 \end_layout
 146
 147 \begin_layout Standard
 148 The wider variety and greater demands of TDB-using code has lead to some
 149  organic growth of the API, as well as some compromises on the implementation.
 150  None of these, by themselves, are seen as show-stoppers, but the cumulative
 151  effect is to a loss of elegance over the initial, simple TDB implementation.
 152  Here is a table of the approximate number of lines of implementation code
 153  and number of API functions at the end of each year:
 154 \end_layout
 155
 156 \begin_layout Standard
 157 \begin_inset Tabular
 158 <lyxtabular version="3" rows="12" columns="3">
 159 <features>
 160 <column alignment="center" valignment="top" width="0">
 161 <column alignment="center" valignment="top" width="0">
 162 <column alignment="center" valignment="top" width="0">
 163 <row>
 164 <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
 165 \begin_inset Text
 166
 167 \begin_layout Plain Layout
 168 Year End
 169 \end_layout
 170
 171 \end_inset
 172 </cell>
 173 <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
 174 \begin_inset Text
 175
 176 \begin_layout Plain Layout
 177 API Functions
 178 \end_layout
 179
 180 \end_inset
 181 </cell>
 182 <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
 183 \begin_inset Text
 184
 185 \begin_layout Plain Layout
 186 Lines of C Code Implementation
 187 \end_layout
 188
 189 \end_inset
 190 </cell>
 191 </row>
 192 <row>
 193 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 194 \begin_inset Text
 195
 196 \begin_layout Plain Layout
 197 1999
 198 \end_layout
 199
 200 \end_inset
 201 </cell>
 202 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 203 \begin_inset Text
 204
 205 \begin_layout Plain Layout
 206 13
 207 \end_layout
 208
 209 \end_inset
 210 </cell>
 211 <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
 212 \begin_inset Text
 213
 214 \begin_layout Plain Layout
 215 1195
 216 \end_layout
 217
 218 \end_inset
 219 </cell>
 220 </row>
 221 <row>
 222 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 223 \begin_inset Text
 224
 225 \begin_layout Plain Layout
 226 2000
 227 \end_layout
 228
 229 \end_inset
 230 </cell>
 231 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 232 \begin_inset Text
 233
 234 \begin_layout Plain Layout
 235 24
 236 \end_layout
 237
 238 \end_inset
 239 </cell>
 240 <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
 241 \begin_inset Text
 242
 243 \begin_layout Plain Layout
 244 1725
 245 \end_layout
 246
 247 \end_inset
 248 </cell>
 249 </row>
 250 <row>
 251 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 252 \begin_inset Text
 253
 254 \begin_layout Plain Layout
 255 2001
 256 \end_layout
 257
 258 \end_inset
 259 </cell>
 260 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 261 \begin_inset Text
 262
 263 \begin_layout Plain Layout
 264 32
 265 \end_layout
 266
 267 \end_inset
 268 </cell>
 269 <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
 270 \begin_inset Text
 271
 272 \begin_layout Plain Layout
 273 2228
 274 \end_layout
 275
 276 \end_inset
 277 </cell>
 278 </row>
 279 <row>
 280 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 281 \begin_inset Text
 282
 283 \begin_layout Plain Layout
 284 2002
 285 \end_layout
 286
 287 \end_inset
 288 </cell>
 289 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 290 \begin_inset Text
 291
 292 \begin_layout Plain Layout
 293 35
 294 \end_layout
 295
 296 \end_inset
 297 </cell>
 298 <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
 299 \begin_inset Text
 300
 301 \begin_layout Plain Layout
 302 2481
 303 \end_layout
 304
 305 \end_inset
 306 </cell>
 307 </row>
 308 <row>
 309 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 310 \begin_inset Text
 311
 312 \begin_layout Plain Layout
 313 2003
 314 \end_layout
 315
 316 \end_inset
 317 </cell>
 318 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 319 \begin_inset Text
 320
 321 \begin_layout Plain Layout
 322 35
 323 \end_layout
 324
 325 \end_inset
 326 </cell>
 327 <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
 328 \begin_inset Text
 329
 330 \begin_layout Plain Layout
 331 2552
 332 \end_layout
 333
 334 \end_inset
 335 </cell>
 336 </row>
 337 <row>
 338 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 339 \begin_inset Text
 340
 341 \begin_layout Plain Layout
 342 2004
 343 \end_layout
 344
 345 \end_inset
 346 </cell>
 347 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 348 \begin_inset Text
 349
 350 \begin_layout Plain Layout
 351 40
 352 \end_layout
 353
 354 \end_inset
 355 </cell>
 356 <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
 357 \begin_inset Text
 358
 359 \begin_layout Plain Layout
 360 2584
 361 \end_layout
 362
 363 \end_inset
 364 </cell>
 365 </row>
 366 <row>
 367 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 368 \begin_inset Text
 369
 370 \begin_layout Plain Layout
 371 2005
 372 \end_layout
 373
 374 \end_inset
 375 </cell>
 376 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 377 \begin_inset Text
 378
 379 \begin_layout Plain Layout
 380 38
 381 \end_layout
 382
 383 \end_inset
 384 </cell>
 385 <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
 386 \begin_inset Text
 387
 388 \begin_layout Plain Layout
 389 2647
 390 \end_layout
 391
 392 \end_inset
 393 </cell>
 394 </row>
 395 <row>
 396 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 397 \begin_inset Text
 398
 399 \begin_layout Plain Layout
 400 2006
 401 \end_layout
 402
 403 \end_inset
 404 </cell>
 405 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 406 \begin_inset Text
 407
 408 \begin_layout Plain Layout
 409 52
 410 \end_layout
 411
 412 \end_inset
 413 </cell>
 414 <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
 415 \begin_inset Text
 416
 417 \begin_layout Plain Layout
 418 3754
 419 \end_layout
 420
 421 \end_inset
 422 </cell>
 423 </row>
 424 <row>
 425 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 426 \begin_inset Text
 427
 428 \begin_layout Plain Layout
 429 2007
 430 \end_layout
 431
 432 \end_inset
 433 </cell>
 434 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 435 \begin_inset Text
 436
 437 \begin_layout Plain Layout
 438 66
 439 \end_layout
 440
 441 \end_inset
 442 </cell>
 443 <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
 444 \begin_inset Text
 445
 446 \begin_layout Plain Layout
 447 4398
 448 \end_layout
 449
 450 \end_inset
 451 </cell>
 452 </row>
 453 <row>
 454 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 455 \begin_inset Text
 456
 457 \begin_layout Plain Layout
 458 2008
 459 \end_layout
 460
 461 \end_inset
 462 </cell>
 463 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 464 \begin_inset Text
 465
 466 \begin_layout Plain Layout
 467 71
 468 \end_layout
 469
 470 \end_inset
 471 </cell>
 472 <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
 473 \begin_inset Text
 474
 475 \begin_layout Plain Layout
 476 4768
 477 \end_layout
 478
 479 \end_inset
 480 </cell>
 481 </row>
 482 <row>
 483 <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
 484 \begin_inset Text
 485
 486 \begin_layout Plain Layout
 487 2009
 488 \end_layout
 489
 490 \end_inset
 491 </cell>
 492 <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
 493 \begin_inset Text
 494
 495 \begin_layout Plain Layout
 496 73
 497 \end_layout
 498
 499 \end_inset
 500 </cell>
 501 <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
 502 \begin_inset Text
 503
 504 \begin_layout Plain Layout
 505 5715
 506 \end_layout
 507
 508 \end_inset
 509 </cell>
 510 </row>
 511 </lyxtabular>
 512
 513 \end_inset
 514
 515
 516 \end_layout
 517
 518 \begin_layout Standard
 519 This review is an attempt to catalog and address all the known issues with
 520  TDB and create solutions which address the problems without significantly
 521  increasing complexity; all involved are far too aware of the dangers of
 522  second system syndrome in rewriting a successful project like this.
 523 \end_layout
 524
 525 \begin_layout Section
 526 API Issues
 527 \end_layout
 528
 529 \begin_layout Subsection
 530 tdb_open_ex Is Not Expandable
 531 \end_layout
 532
 533 \begin_layout Standard
 534 The tdb_open() call was expanded to tdb_open_ex(), which added an optional
 535  hashing function and an optional logging function argument.
 536  Additional arguments to open would require the introduction of a tdb_open_ex2
 537  call etc.
 538 \end_layout
 539
 540 \begin_layout Subsubsection
 541 Proposed Solution
 542 \end_layout
 543
 544 \begin_layout Standard
 545 tdb_open() will take a linked-list of attributes:
 546 \end_layout
 547
 548 \begin_layout LyX-Code
 549 enum tdb_attribute {
 550 \end_layout
 551
 552 \begin_layout LyX-Code
 553     TDB_ATTRIBUTE_LOG = 0,
 554 \end_layout
 555
 556 \begin_layout LyX-Code
 557     TDB_ATTRIBUTE_HASH = 1
 558 \end_layout
 559
 560 \begin_layout LyX-Code
 561 };
 562 \end_layout
 563
 564 \begin_layout LyX-Code
 565 struct tdb_attribute_base {
 566 \end_layout
 567
 568 \begin_layout LyX-Code
 569     enum tdb_attribute attr;
 570 \end_layout
 571
 572 \begin_layout LyX-Code
 573     union tdb_attribute *next;
 574 \end_layout
 575
 576 \begin_layout LyX-Code
 577 };
 578 \end_layout
 579
 580 \begin_layout LyX-Code
 581 struct tdb_attribute_log {
 582 \end_layout
 583
 584 \begin_layout LyX-Code
 585     struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG */
 586 \end_layout
 587
 588 \begin_layout LyX-Code
 589     tdb_log_func log_fn;
 590 \end_layout
 591
 592 \begin_layout LyX-Code
 593     void *log_private;
 594 \end_layout
 595
 596 \begin_layout LyX-Code
 597 };
 598 \end_layout
 599
 600 \begin_layout LyX-Code
 601 struct tdb_attribute_hash {
 602 \end_layout
 603
 604 \begin_layout LyX-Code
 605     struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH */
 606 \end_layout
 607
 608 \begin_layout LyX-Code
 609     tdb_hash_func hash_fn;
 610 \end_layout
 611
 612 \begin_layout LyX-Code
 613     void *hash_private;
 614 \end_layout
 615
 616 \begin_layout LyX-Code
 617 };
 618 \end_layout
 619
 620 \begin_layout LyX-Code
 621 union tdb_attribute {
 622 \end_layout
 623
 624 \begin_layout LyX-Code
 625     struct tdb_attribute_base base;
 626 \end_layout
 627
 628 \begin_layout LyX-Code
 629     struct tdb_attribute_log log;
 630 \end_layout
 631
 632 \begin_layout LyX-Code
 633     struct tdb_attribute_hash hash;
 634 \end_layout
 635
 636 \begin_layout LyX-Code
 637 };
 638 \end_layout
 639
 640 \begin_layout Standard
 641 This allows future attributes to be added, even if this expands the size
 642  of the union.
 643 \end_layout
 644
 645 \begin_layout Subsection
 646 tdb_traverse Makes Impossible Guarantees
 647 \end_layout
 648
 649 \begin_layout Standard
 650 tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions, and it
 651  was thought that it was important to guarantee that all records which exist
 652  at the start and end of the traversal would be included, and no record
 653  would be included twice.
 654 \end_layout
 655
 656 \begin_layout Standard
 657 This adds complexity (see
 658 \begin_inset CommandInset ref
 659 LatexCommand ref
 660 reference "Reliable-Traversal-Adds"
 661
 662 \end_inset
 663
 664 ) and does not work anyway for records which are altered (in particular,
 665  those which are expanded may be effectively deleted and re-added behind
 666  the traversal).
 667 \end_layout
 668
 669 \begin_layout Subsubsection
 670 \begin_inset CommandInset label
 671 LatexCommand label
 672 name "traverse-Proposed-Solution"
 673
 674 \end_inset
 675
 676 Proposed Solution
 677 \end_layout
 678
 679 \begin_layout Standard
 680 Abandon the guarantee.
 681  You will see every record if no changes occur during your traversal, otherwise
 682  you will see some subset.
 683  You can prevent changes by using a transaction or the locking API.
 684 \end_layout
 685
 686 \begin_layout Subsection
 687 Nesting of Transactions Is Fraught
 688 \end_layout
 689
 690 \begin_layout Standard
 691 TDB has alternated between allowing nested transactions and not allowing
 692  them.
 693  Various paths in the Samba codebase assume that transactions will nest,
 694  and in a sense they can: the operation is only committed to disk when the
 695  outer transaction is committed.
 696  There are two problems, however:
 697 \end_layout
 698
 699 \begin_layout Enumerate
 700 Canceling the inner transaction will cause the outer transaction commit
 701  to fail, and will not undo any operations since the inner transaction began.
 702  This problem is soluble with some additional internal code.
 703 \end_layout
 704
 705 \begin_layout Enumerate
 706 An inner transaction commit can be cancelled by the outer transaction.
 707  This is desirable in the way which Samba's database initialization code
 708  uses transactions, but could be a surprise to any users expecting a successful
 709  transaction commit to expose changes to others.
 710 \end_layout
 711
 712 \begin_layout Standard
 713 The current solution is to specify the behavior at tdb_open(), with the
 714  default currently that nested transactions are allowed.
 715  This flag can also be changed at runtime.
 716 \end_layout
 717
 718 \begin_layout Subsubsection
 719 Proposed Solution
 720 \end_layout
 721
 722 \begin_layout Standard
 723 Given the usage patterns, it seems that the
 724 \begin_inset Quotes eld
 725 \end_inset
 726
 727 least-surprise
 728 \begin_inset Quotes erd
 729 \end_inset
 730
 731  behavior of disallowing nested transactions should become the default.
 732  Additionally, it seems the outer transaction is the only code which knows
 733  whether inner transactions should be allowed, so a flag to indicate this
 734  could be added to tdb_transaction_start.
 735  However, this behavior can be simulated with a wrapper which uses tdb_add_flags
 736 () and tdb_remove_flags(), so the API should not be expanded for this relatively
 737 -obscure case.
 738 \end_layout
 739
 740 \begin_layout Subsection
 741 Incorrect Hash Function is Not Detected
 742 \end_layout
 743
 744 \begin_layout Standard
 745 tdb_open_ex() allows the calling code to specify a different hash function
 746  to use, but does not check that all other processes accessing this tdb
 747  are using the same hash function.
 748  The result is that records are missing from tdb_fetch().
 749 \end_layout
 750
 751 \begin_layout Subsubsection
 752 Proposed Solution
 753 \end_layout
 754
 755 \begin_layout Standard
 756 The header should contain an example hash result (eg.
 757  the hash of 0xdeadbeef), and tdb_open_ex() should check that the given
 758  hash function produces the same answer, or fail the tdb_open call.
 759 \end_layout
 760
 761 \begin_layout Subsection
 762 tdb_set_max_dead/TDB_VOLATILE Expose Implementation
 763 \end_layout
 764
 765 \begin_layout Standard
 766 In response to scalability issues with the free list (
 767 \begin_inset CommandInset ref
 768 LatexCommand ref
 769 reference "TDB-Freelist-Is"
 770
 771 \end_inset
 772
 773 ) two API workarounds have been incorporated in TDB: tdb_set_max_dead()
 774  and the TDB_VOLATILE flag to tdb_open.
 775  The latter actually calls the former with an argument of
 776 \begin_inset Quotes eld
 777 \end_inset
 778
 779 5
 780 \begin_inset Quotes erd
 781 \end_inset
 782
 783 .
 784 \end_layout
 785
 786 \begin_layout Standard
 787 This code allows deleted records to accumulate without putting them in the
 788  free list.
 789  On delete we iterate through each chain and free them in a batch if there
 790  are more than max_dead entries.
 791  These are never otherwise recycled except as a side-effect of a tdb_repack.
 792 \end_layout
 793
 794 \begin_layout Subsubsection
 795 Proposed Solution
 796 \end_layout
 797
 798 \begin_layout Standard
 799 With the scalability problems of the freelist solved, this API can be removed.
 800  The TDB_VOLATILE flag may still be useful as a hint that store and delete
 801  of records will be at least as common as fetch in order to allow some internal
 802  tuning, but initially will become a no-op.
 803 \end_layout
 804
 805 \begin_layout Subsection
 806 \begin_inset CommandInset label
 807 LatexCommand label
 808 name "TDB-Files-Cannot"
 809
 810 \end_inset
 811
 812 TDB Files Cannot Be Opened Multiple Times In The Same Process
 813 \end_layout
 814
 815 \begin_layout Standard
 816 No process can open the same TDB twice; we check and disallow it.
 817  This is an unfortunate side-effect of fcntl locks, which operate on a per-file
 818  rather than per-file-descriptor basis, and do not nest.
 819  Thus, closing any file descriptor on a file clears all the locks obtained
 820  by this process, even if they were placed using a different file descriptor!
 821 \end_layout
 822
 823 \begin_layout Standard
 824 Note that even if this were solved, deadlock could occur if operations were
 825  nested: this is a more manageable programming error in most cases.
 826 \end_layout
 827
 828 \begin_layout Subsubsection
 829 Proposed Solution
 830 \end_layout
 831
 832 \begin_layout Standard
 833 We could lobby POSIX to fix the perverse rules, or at least lobby Linux
 834  to violate them so that the most common implementation does not have this
 835  restriction.
 836  This would be a generally good idea for other fcntl lock users.
 837 \end_layout
 838
 839 \begin_layout Standard
 840 Samba uses a wrapper which hands out the same tdb_context to multiple callers
 841  if this happens, and does simple reference counting.
 842  We should do this inside the tdb library, which already emulates lock nesting
 843  internally; it would need to recognize when deadlock occurs within a single
 844  process.
 845  This would create a new failure mode for tdb operations (while we currently
 846  handle locking failures, they are impossible in normal use and a process
 847  encountering them can do little but give up).
 848 \end_layout
 849
 850 \begin_layout Standard
 851 I do not see benefit in an additional tdb_open flag to indicate whether
 852  re-opening is allowed, as though there may be some benefit to adding a
 853  call to detect when a tdb_context is shared, to allow other to create such
 854  an API.
 855 \end_layout
 856
 857 \begin_layout Subsection
 858 TDB API Is Not POSIX Thread-safe
 859 \end_layout
 860
 861 \begin_layout Standard
 862 The TDB API uses an error code which can be queried after an operation to
 863  determine what went wrong.
 864  This programming model does not work with threads, unless specific additional
 865  guarantees are given by the implementation.
 866  In addition, even otherwise-independent threads cannot open the same TDB
 867  (as in
 868 \begin_inset CommandInset ref
 869 LatexCommand ref
 870 reference "TDB-Files-Cannot"
 871
 872 \end_inset
 873
 874 ).
 875 \end_layout
 876
 877 \begin_layout Subsubsection
 878 Proposed Solution
 879 \end_layout
 880
 881 \begin_layout Standard
 882 Reachitecting the API to include a tdb_errcode pointer would be a great
 883  deal of churn; we are better to guarantee that the tdb_errcode is per-thread
 884  so the current programming model can be maintained.
 885 \end_layout
 886
 887 \begin_layout Standard
 888 This requires dynamic per-thread allocations, which is awkward with POSIX
 889  threads (pthread_key_create space is limited and we cannot simply allocate
 890  a key for every TDB).
 891 \end_layout
 892
 893 \begin_layout Standard
 894 Internal locking is required to make sure that fcntl locks do not overlap
 895  between threads, and also that the global list of tdbs is maintained.
 896 \end_layout
 897
 898 \begin_layout Standard
 899 The aim is that building tdb with -DTDB_PTHREAD will result in a pthread-safe
 900  version of the library, and otherwise no overhead will exist.
 901
 902 \change_inserted 0 1284016998
 903  Alternatively, a hooking mechanism similar to that proposed for
 904 \begin_inset CommandInset ref
 905 LatexCommand ref
 906 reference "Proposed-Solution-locking-hook"
 907
 908 \end_inset
 909
 910  could be used to enable pthread locking at runtime.
 911 \change_unchanged
 912
 913 \end_layout
 914
 915 \begin_layout Subsection
 916 *_nonblock Functions And *_mark Functions Expose Implementation
 917 \end_layout
 918
 919 \begin_layout Standard
 920 CTDB
 921 \begin_inset Foot
 922 status collapsed
 923
 924 \begin_layout Plain Layout
 925 Clustered TDB, see http://ctdb.samba.org
 926 \end_layout
 927
 928 \end_inset
 929
 930  wishes to operate on TDB in a non-blocking manner.
 931  This is currently done as follows:
 932 \end_layout
 933
 934 \begin_layout Enumerate
 935 Call the _nonblock variant of an API function (eg.
 936  tdb_lockall_nonblock).
 937  If this fails:
 938 \end_layout
 939
 940 \begin_layout Enumerate
 941 Fork a child process, and wait for it to call the normal variant (eg.
 942  tdb_lockall).
 943 \end_layout
 944
 945 \begin_layout Enumerate
 946 If the child succeeds, call the _mark variant to indicate we already have
 947  the locks (eg.
 948  tdb_lockall_mark).
 949 \end_layout
 950
 951 \begin_layout Enumerate
 952 Upon completion, tell the child to release the locks (eg.
 953  tdb_unlockall).
 954 \end_layout
 955
 956 \begin_layout Enumerate
 957 Indicate to tdb that it should consider the locks removed (eg.
 958  tdb_unlockall_mark).
 959 \end_layout
 960
 961 \begin_layout Standard
 962 There are several issues with this approach.
 963  Firstly, adding two new variants of each function clutters the API for
 964  an obscure use, and so not all functions have three variants.
 965  Secondly, it assumes that all paths of the functions ask for the same locks,
 966  otherwise the parent process will have to get a lock which the child doesn't
 967  have under some circumstances.
 968  I don't believe this is currently the case, but it constrains the implementatio
 969 n.
 970
 971 \end_layout
 972
 973 \begin_layout Subsubsection
 974 \begin_inset CommandInset label
 975 LatexCommand label
 976 name "Proposed-Solution-locking-hook"
 977
 978 \end_inset
 979
 980 Proposed Solution
 981 \end_layout
 982
 983 \begin_layout Standard
 984 Implement a hook for locking methods, so that the caller can control the
 985  calls to create and remove fcntl locks.
 986  In this scenario, ctdbd would operate as follows:
 987 \end_layout
 988
 989 \begin_layout Enumerate
 990 Call the normal API function, eg tdb_lockall().
 991 \end_layout
 992
 993 \begin_layout Enumerate
 994 When the lock callback comes in, check if the child has the lock.
 995  Initially, this is always false.
 996  If so, return 0.
 997  Otherwise, try to obtain it in non-blocking mode.
 998  If that fails, return EWOULDBLOCK.
 999 \end_layout
1000
1001 \begin_layout Enumerate
1002 Release locks in the unlock callback as normal.
1003 \end_layout
1004
1005 \begin_layout Enumerate
1006 If tdb_lockall() fails, see if we recorded a lock failure; if so, call the
1007  child to repeat the operation.
1008 \end_layout
1009
1010 \begin_layout Enumerate
1011 The child records what locks it obtains, and returns that information to
1012  the parent.
1013 \end_layout
1014
1015 \begin_layout Enumerate
1016 When the child has succeeded, goto 1.
1017 \end_layout
1018
1019 \begin_layout Standard
1020 This is flexible enough to handle any potential locking scenario, even when
1021  lock requirements change.
1022  It can be optimized so that the parent does not release locks, just tells
1023  the child which locks it doesn't need to obtain.
1024 \end_layout
1025
1026 \begin_layout Standard
1027 It also keeps the complexity out of the API, and in ctdbd where it is needed.
1028 \end_layout
1029
1030 \begin_layout Subsection
1031 tdb_chainlock Functions Expose Implementation
1032 \end_layout
1033
1034 \begin_layout Standard
1035 tdb_chainlock locks some number of records, including the record indicated
1036  by the given key.
1037  This gave atomicity guarantees; no-one can start a transaction, alter,
1038  read or delete that key while the lock is held.
1039 \end_layout
1040
1041 \begin_layout Standard
1042 It also makes the same guarantee for any other key in the chain, which is
1043  an internal implementation detail and potentially a cause for deadlock.
1044 \end_layout
1045
1046 \begin_layout Subsubsection
1047 Proposed Solution
1048 \end_layout
1049
1050 \begin_layout Standard
1051 None.
1052  It would be nice to have an explicit single entry lock which effected no
1053  other keys.
1054  Unfortunately, this won't work for an entry which doesn't exist.
1055  Thus while chainlock may be implemented more efficiently for the existing
1056  case, it will still have overlap issues with the non-existing case.
1057  So it is best to keep the current (lack of) guarantee about which records
1058  will be effected to avoid constraining our implementation.
1059 \end_layout
1060
1061 \begin_layout Subsection
1062 Signal Handling is Not Race-Free
1063 \end_layout
1064
1065 \begin_layout Standard
1066 The tdb_setalarm_sigptr() call allows the caller's signal handler to indicate
1067  that the tdb locking code should return with a failure, rather than trying
1068  again when a signal is received (and errno == EAGAIN).
1069  This is usually used to implement timeouts.
1070 \end_layout
1071
1072 \begin_layout Standard
1073 Unfortunately, this does not work in the case where the signal is received
1074  before the tdb code enters the fcntl() call to place the lock: the code
1075  will sleep within the fcntl() code, unaware that the signal wants it to
1076  exit.
1077  In the case of long timeouts, this does not happen in practice.
1078 \end_layout
1079
1080 \begin_layout Subsubsection
1081 Proposed Solution
1082 \end_layout
1083
1084 \begin_layout Standard
1085 The locking hooks proposed in
1086 \begin_inset CommandInset ref
1087 LatexCommand ref
1088 reference "Proposed-Solution-locking-hook"
1089
1090 \end_inset
1091
1092  would allow the user to decide on whether to fail the lock acquisition
1093  on a signal.
1094  This allows the caller to choose their own compromise: they could narrow
1095  the race by checking immediately before the fcntl call.
1096 \begin_inset Foot
1097 status collapsed
1098
1099 \begin_layout Plain Layout
1100 It may be possible to make this race-free in some implementations by having
1101  the signal handler alter the struct flock to make it invalid.
1102  This will cause the fcntl() lock call to fail with EINVAL if the signal
1103  occurs before the kernel is entered, otherwise EAGAIN.
1104 \end_layout
1105
1106 \end_inset
1107
1108
1109 \end_layout
1110
1111 \begin_layout Subsection
1112 The API Uses Gratuitous Typedefs, Capitals
1113 \end_layout
1114
1115 \begin_layout Standard
1116 typedefs are useful for providing source compatibility when types can differ
1117  across implementations, or arguably in the case of function pointer definitions
1118  which are hard for humans to parse.
1119  Otherwise it is simply obfuscation and pollutes the namespace.
1120 \end_layout
1121
1122 \begin_layout Standard
1123 Capitalization is usually reserved for compile-time constants and macros.
1124 \end_layout
1125
1126 \begin_layout Description
1127 TDB_CONTEXT There is no reason to use this over 'struct tdb_context'; the
1128  definition isn't visible to the API user anyway.
1129 \end_layout
1130
1131 \begin_layout Description
1132 TDB_DATA There is no reason to use this over struct TDB_DATA; the struct
1133  needs to be understood by the API user.
1134 \end_layout
1135
1136 \begin_layout Description
1137 struct
1138 \begin_inset space ~
1139 \end_inset
1140
1141 TDB_DATA This would normally be called 'struct tdb_data'.
1142 \end_layout
1143
1144 \begin_layout Description
1145 enum
1146 \begin_inset space ~
1147 \end_inset
1148
1149 TDB_ERROR Similarly, this would normally be enum tdb_error.
1150 \end_layout
1151
1152 \begin_layout Subsubsection
1153 Proposed Solution
1154 \end_layout
1155
1156 \begin_layout Standard
1157 None.
1158  Introducing lower case variants would please pedants like myself, but if
1159  it were done the existing ones should be kept.
1160  There is little point forcing a purely cosmetic change upon tdb users.
1161 \end_layout
1162
1163 \begin_layout Subsection
1164 \begin_inset CommandInset label
1165 LatexCommand label
1166 name "tdb_log_func-Doesnt-Take"
1167
1168 \end_inset
1169
1170 tdb_log_func Doesn't Take The Private Pointer
1171 \end_layout
1172
1173 \begin_layout Standard
1174 For API compatibility reasons, the logging function needs to call tdb_get_loggin
1175 g_private() to retrieve the pointer registered by the tdb_open_ex for logging.
1176 \end_layout
1177
1178 \begin_layout Subsubsection
1179 Proposed Solution
1180 \end_layout
1181
1182 \begin_layout Standard
1183 It should simply take an extra argument, since we are prepared to break
1184  the API/ABI.
1185 \end_layout
1186
1187 \begin_layout Subsection
1188 Various Callback Functions Are Not Typesafe
1189 \end_layout
1190
1191 \begin_layout Standard
1192 The callback functions in tdb_set_logging_function (after
1193 \begin_inset CommandInset ref
1194 LatexCommand ref
1195 reference "tdb_log_func-Doesnt-Take"
1196
1197 \end_inset
1198
1199  is resolved), tdb_parse_record, tdb_traverse, tdb_traverse_read and tdb_check
1200  all take void * and must internally convert it to the argument type they
1201  were expecting.
1202 \end_layout
1203
1204 \begin_layout Standard
1205 If this type changes, the compiler will not produce warnings on the callers,
1206  since it only sees void *.
1207 \end_layout
1208
1209 \begin_layout Subsubsection
1210 Proposed Solution
1211 \end_layout
1212
1213 \begin_layout Standard
1214 With careful use of macros, we can create callback functions which give
1215  a warning when used on gcc and the types of the callback and its private
1216  argument differ.
1217  Unsupported compilers will not give a warning, which is no worse than now.
1218  In addition, the callbacks become clearer, as they need not use void *
1219  for their parameter.
1220 \end_layout
1221
1222 \begin_layout Standard
1223 See CCAN's typesafe_cb module at http://ccan.ozlabs.org/info/typesafe_cb.html
1224 \end_layout
1225
1226 \begin_layout Subsection
1227 TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, tdb_reopen_all Problematic
1228 \end_layout
1229
1230 \begin_layout Standard
1231 The TDB_CLEAR_IF_FIRST flag to tdb_open indicates that the TDB file should
1232  be cleared if the caller discovers it is the only process with the TDB
1233  open.
1234  However, if any caller does not specify TDB_CLEAR_IF_FIRST it will not
1235  be detected, so will have the TDB erased underneath them (usually resulting
1236  in a crash).
1237 \end_layout
1238
1239 \begin_layout Standard
1240 There is a similar issue on fork(); if the parent exits (or otherwise closes
1241  the tdb) before the child calls tdb_reopen_all() to establish the lock
1242  used to indicate the TDB is opened by someone, a TDB_CLEAR_IF_FIRST opener
1243  at that moment will believe it alone has opened the TDB and will erase
1244  it.
1245 \end_layout
1246
1247 \begin_layout Subsubsection
1248 Proposed Solution
1249 \end_layout
1250
1251 \begin_layout Standard
1252 Remove TDB_CLEAR_IF_FIRST.
1253  Other workarounds are possible, but see
1254 \begin_inset CommandInset ref
1255 LatexCommand ref
1256 reference "TDB_CLEAR_IF_FIRST-Imposes-Performance"
1257
1258 \end_inset
1259
1260 .
1261 \change_inserted 0 1284015637
1262
1263 \end_layout
1264
1265 \begin_layout Subsection
1266
1267 \change_inserted 0 1284015716
1268 Extending The Header Is Difficult
1269 \end_layout
1270
1271 \begin_layout Standard
1272
1273 \change_inserted 0 1284015906
1274 We have reserved (zeroed) words in the TDB header, which can be used for
1275  future features.
1276  If the future features are compulsory, the version number must be updated
1277  to prevent old code from accessing the database.
1278  But if the future feature is optional, we have no way of telling if older
1279  code is accessing the database or not.
1280 \end_layout
1281
1282 \begin_layout Subsubsection
1283
1284 \change_inserted 0 1284015637
1285 Proposed Solution
1286 \end_layout
1287
1288 \begin_layout Standard
1289
1290 \change_inserted 0 1284016114
1291 The header should contain a
1292 \begin_inset Quotes eld
1293 \end_inset
1294
1295 format variant
1296 \begin_inset Quotes erd
1297 \end_inset
1298
1299  value (64-bit).
1300  This is divided into two 32-bit parts:
1301 \end_layout
1302
1303 \begin_layout Enumerate
1304
1305 \change_inserted 0 1284016149
1306 The lower part reflects the format variant understood by code accessing
1307  the database.
1308 \end_layout
1309
1310 \begin_layout Enumerate
1311
1312 \change_inserted 0 1284016639
1313 The upper part reflects the format variant you must understand to write
1314  to the database (otherwise you can only open for reading).
1315 \end_layout
1316
1317 \begin_layout Standard
1318
1319 \change_inserted 0 1284016821
1320 The latter field can only be written at creation time, the former should
1321  be written under the OPEN_LOCK when opening the database for writing, if
1322  the variant of the code is lower than the current lowest variant.
1323 \end_layout
1324
1325 \begin_layout Standard
1326
1327 \change_inserted 0 1284016803
1328 This should allow backwards-compatible features to be added, and detection
1329  if older code (which doesn't understand the feature) writes to the database.
1330 \change_deleted 0 1284016101
1331
1332 \end_layout
1333
1334 \begin_layout Subsection
1335
1336 \change_inserted 0 1284015634
1337 Record Headers Are Not Expandible
1338 \end_layout
1339
1340 \begin_layout Standard
1341
1342 \change_inserted 0 1284015634
1343 If we later want to add (say) checksums on keys and data, it would require
1344  another format change, which we'd like to avoid.
1345 \end_layout
1346
1347 \begin_layout Subsubsection
1348
1349 \change_inserted 0 1284015634
1350 Proposed Solution
1351 \end_layout
1352
1353 \begin_layout Standard
1354
1355 \change_inserted 0 1284016847
1356 We often have extra padding at the tail of a record.
1357  If we ensure that the first byte (if any) of this padding is zero, we will
1358  have a way for future changes to detect code which doesn't understand a
1359  new format: the new code would write (say) a 1 at the tail, and thus if
1360  there is no tail or the first byte is 0, we would know the extension is
1361  not present on that record.
1362 \change_unchanged
1363
1364 \end_layout
1365
1366 \begin_layout Section
1367 Performance And Scalability Issues
1368 \end_layout
1369
1370 \begin_layout Subsection
1371 \begin_inset CommandInset label
1372 LatexCommand label
1373 name "TDB_CLEAR_IF_FIRST-Imposes-Performance"
1374
1375 \end_inset
1376
1377 TDB_CLEAR_IF_FIRST Imposes Performance Penalty
1378 \end_layout
1379
1380 \begin_layout Standard
1381 When TDB_CLEAR_IF_FIRST is specified, a 1-byte read lock is placed at offset
1382  4 (aka.
1383  the ACTIVE_LOCK).
1384  While these locks never conflict in normal tdb usage, they do add substantial
1385  overhead for most fcntl lock implementations when the kernel scans to detect
1386  if a lock conflict exists.
1387  This is often a single linked list, making the time to acquire and release
1388  a fcntl lock O(N) where N is the number of processes with the TDB open,
1389  not the number actually doing work.
1390 \end_layout
1391
1392 \begin_layout Standard
1393 In a Samba server it is common to have huge numbers of clients sitting idle,
1394  and thus they have weaned themselves off the TDB_CLEAR_IF_FIRST flag.
1395 \begin_inset Foot
1396 status collapsed
1397
1398 \begin_layout Plain Layout
1399 There is a flag to tdb_reopen_all() which is used for this optimization:
1400  if the parent process will outlive the child, the child does not need the
1401  ACTIVE_LOCK.
1402  This is a workaround for this very performance issue.
1403 \end_layout
1404
1405 \end_inset
1406
1407
1408 \end_layout
1409
1410 \begin_layout Subsubsection
1411 Proposed Solution
1412 \end_layout
1413
1414 \begin_layout Standard
1415 Remove the flag.
1416  It was a neat idea, but even trivial servers tend to know when they are
1417  initializing for the first time and can simply unlink the old tdb at that
1418  point.
1419 \end_layout
1420
1421 \begin_layout Subsection
1422 TDB Files Have a 4G Limit
1423 \end_layout
1424
1425 \begin_layout Standard
1426 This seems to be becoming an issue (so much for
1427 \begin_inset Quotes eld
1428 \end_inset
1429
1430 trivial
1431 \begin_inset Quotes erd
1432 \end_inset
1433
1434 !), particularly for ldb.
1435 \end_layout
1436
1437 \begin_layout Subsubsection
1438 Proposed Solution
1439 \end_layout
1440
1441 \begin_layout Standard
1442 A new, incompatible TDB format which uses 64 bit offsets internally rather
1443  than 32 bit as now.
1444  For simplicity of endian conversion (which TDB does on the fly if required),
1445  all values will be 64 bit on disk.
1446  In practice, some upper bits may be used for other purposes, but at least
1447  56 bits will be available for file offsets.
1448 \end_layout
1449
1450 \begin_layout Standard
1451 tdb_open() will automatically detect the old version, and even create them
1452  if TDB_VERSION6 is specified to tdb_open.
1453 \end_layout
1454
1455 \begin_layout Standard
1456 32 bit processes will still be able to access TDBs larger than 4G (assuming
1457  that their off_t allows them to seek to 64 bits), they will gracefully
1458  fall back as they fail to mmap.
1459  This can happen already with large TDBs.
1460 \end_layout
1461
1462 \begin_layout Standard
1463 Old versions of tdb will fail to open the new TDB files (since 28 August
1464  2009, commit 398d0c29290: prior to that any unrecognized file format would
1465  be erased and initialized as a fresh tdb!)
1466 \end_layout
1467
1468 \begin_layout Subsection
1469 TDB Records Have a 4G Limit
1470 \end_layout
1471
1472 \begin_layout Standard
1473 This has not been a reported problem, and the API uses size_t which can
1474  be 64 bit on 64 bit platforms.
1475  However, other limits may have made such an issue moot.
1476 \end_layout
1477
1478 \begin_layout Subsubsection
1479 Proposed Solution
1480 \end_layout
1481
1482 \begin_layout Standard
1483 Record sizes will be 64 bit, with an error returned on 32 bit platforms
1484  which try to access such records (the current implementation would return
1485  TDB_ERR_OOM in a similar case).
1486  It seems unlikely that 32 bit keys will be a limitation, so the implementation
1487  may not support this (see
1488 \begin_inset CommandInset ref
1489 LatexCommand ref
1490 reference "sub:Records-Incur-A"
1491
1492 \end_inset
1493
1494 ).
1495 \end_layout
1496
1497 \begin_layout Subsection
1498 Hash Size Is Determined At TDB Creation Time
1499 \end_layout
1500
1501 \begin_layout Standard
1502 TDB contains a number of hash chains in the header; the number is specified
1503  at creation time, and defaults to 131.
1504  This is such a bottleneck on large databases (as each hash chain gets quite
1505  long), that LDB uses 10,000 for this hash.
1506  In general it is impossible to know what the 'right' answer is at database
1507  creation time.
1508 \end_layout
1509
1510 \begin_layout Subsubsection
1511
1512 \change_inserted 0 1283336713
1513 \begin_inset CommandInset label
1514 LatexCommand label
1515 name "sub:Hash-Size-Solution"
1516
1517 \end_inset
1518
1519
1520 \change_unchanged
1521 Proposed Solution
1522 \end_layout
1523
1524 \begin_layout Standard
1525 After comprehensive performance testing on various scalable hash variants
1526 \begin_inset Foot
1527 status collapsed
1528
1529 \begin_layout Plain Layout
1530 http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94 This was annoying
1531  because I was previously convinced that an expanding tree of hashes would
1532  be very close to optimal.
1533 \end_layout
1534
1535 \end_inset
1536
1537 , it became clear that it is hard to beat a straight linear hash table which
1538  doubles in size when it reaches saturation.
1539
1540 \change_deleted 0 1283307675
1541 There are three details which become important:
1542 \end_layout
1543
1544 \begin_layout Enumerate
1545
1546 \change_deleted 0 1283307675
1547 On encountering a full bucket, we use the next bucket.
1548 \end_layout
1549
1550 \begin_layout Enumerate
1551
1552 \change_deleted 0 1283307675
1553 Extra hash bits are stored with the offset, to reduce comparisons.
1554 \end_layout
1555
1556 \begin_layout Enumerate
1557
1558 \change_deleted 0 1283307675
1559 A marker entry is used on deleting an entry.
1560 \end_layout
1561
1562 \begin_layout Standard
1563
1564 \change_deleted 0 1283307675
1565 The doubling of the table must be done under a transaction; we will not
1566  reduce it on deletion, so it will be an unusual case.
1567  It will either be placed at the head (other entries will be moved out the
1568  way so we can expand).
1569  We could have a pointer in the header to the current hashtable location,
1570  but that pointer would have to be read frequently to check for hashtable
1571  moves.
1572 \end_layout
1573
1574 \begin_layout Standard
1575
1576 \change_deleted 0 1283307675
1577 The locking for this is slightly more complex than the chained case; we
1578  currently have one lock per bucket, and that means we would need to expand
1579  the lock if we overflow to the next bucket.
1580  The frequency of such collisions will effect our locking heuristics: we
1581  can always lock more buckets than we need.
1582 \end_layout
1583
1584 \begin_layout Standard
1585
1586 \change_deleted 0 1283307675
1587 One possible optimization is to only re-check the hash size on an insert
1588  or a lookup miss.
1589
1590 \change_inserted 0 1283307770
1591  Unfortunately, altering the hash table introduces serious locking complications
1592 : the entire hash table needs to be locked to enlarge the hash table, and
1593  others might be holding locks.
1594  Particularly insidious are insertions done under tdb_chainlock.
1595 \end_layout
1596
1597 \begin_layout Standard
1598
1599 \change_inserted 0 1283336187
1600 Thus an expanding layered hash will be used: an array of hash groups, with
1601  each hash group exploding into pointers to lower hash groups once it fills,
1602  turning into a hash tree.
1603  This has implications for locking: we must lock the entire group in case
1604  we need to expand it, yet we don't know how deep the tree is at that point.
1605 \end_layout
1606
1607 \begin_layout Standard
1608
1609 \change_inserted 0 1283336586
1610 Note that bits from the hash table entries should be stolen to hold more
1611  hash bits to reduce the penalty of collisions.
1612  We can use the otherwise-unused lower 3 bits.
1613  If we limit the size of the database to 64 exabytes, we can use the top
1614  8 bits of the hash entry as well.
1615  These 11 bits would reduce false positives down to 1 in 2000 which is more
1616  than we need: we can use one of the bits to indicate that the extra hash
1617  bits are valid.
1618  This means we can choose not to re-hash all entries when we expand a hash
1619  group; simply use the next bits we need and mark them invalid.
1620 \change_unchanged
1621
1622 \end_layout
1623
1624 \begin_layout Subsection
1625 \begin_inset CommandInset label
1626 LatexCommand label
1627 name "TDB-Freelist-Is"
1628
1629 \end_inset
1630
1631 TDB Freelist Is Highly Contended
1632 \end_layout
1633
1634 \begin_layout Standard
1635 TDB uses a single linked list for the free list.
1636  Allocation occurs as follows, using heuristics which have evolved over
1637  time:
1638 \end_layout
1639
1640 \begin_layout Enumerate
1641 Get the free list lock for this whole operation.
1642 \end_layout
1643
1644 \begin_layout Enumerate
1645 Multiply length by 1.25, so we always over-allocate by 25%.
1646 \end_layout
1647
1648 \begin_layout Enumerate
1649 Set the slack multiplier to 1.
1650 \end_layout
1651
1652 \begin_layout Enumerate
1653 Examine the current freelist entry: if it is > length but < the current
1654  best case, remember it as the best case.
1655 \end_layout
1656
1657 \begin_layout Enumerate
1658 Multiply the slack multiplier by 1.05.
1659 \end_layout
1660
1661 \begin_layout Enumerate
1662 If our best fit so far is less than length * slack multiplier, return it.
1663  The slack will be turned into a new free record if it's large enough.
1664 \end_layout
1665
1666 \begin_layout Enumerate
1667 Otherwise, go onto the next freelist entry.
1668 \end_layout
1669
1670 \begin_layout Standard
1671 Deleting a record occurs as follows:
1672 \end_layout
1673
1674 \begin_layout Enumerate
1675 Lock the hash chain for this whole operation.
1676 \end_layout
1677
1678 \begin_layout Enumerate
1679 Walk the chain to find the record, keeping the prev pointer offset.
1680 \end_layout
1681
1682 \begin_layout Enumerate
1683 If max_dead is non-zero:
1684 \end_layout
1685
1686 \begin_deeper
1687 \begin_layout Enumerate
1688 Walk the hash chain again and count the dead records.
1689 \end_layout
1690
1691 \begin_layout Enumerate
1692 If it's more than max_dead, bulk free all the dead ones (similar to steps
1693  4 and below, but the lock is only obtained once).
1694 \end_layout
1695
1696 \begin_layout Enumerate
1697 Simply mark this record as dead and return.
1698
1699 \end_layout
1700
1701 \end_deeper
1702 \begin_layout Enumerate
1703 Get the free list lock for the remainder of this operation.
1704 \end_layout
1705
1706 \begin_layout Enumerate
1707 \begin_inset CommandInset label
1708 LatexCommand label
1709 name "right-merging"
1710
1711 \end_inset
1712
1713 Examine the following block to see if it is free; if so, enlarge the current
1714  block and remove that block from the free list.
1715  This was disabled, as removal from the free list was O(entries-in-free-list).
1716 \end_layout
1717
1718 \begin_layout Enumerate
1719 Examine the preceeding block to see if it is free: for this reason, each
1720  block has a 32-bit tailer which indicates its length.
1721  If it is free, expand it to cover our new block and return.
1722 \end_layout
1723
1724 \begin_layout Enumerate
1725 Otherwise, prepend ourselves to the free list.
1726 \end_layout
1727
1728 \begin_layout Standard
1729 Disabling right-merging (step
1730 \begin_inset CommandInset ref
1731 LatexCommand ref
1732 reference "right-merging"
1733
1734 \end_inset
1735
1736 ) causes fragmentation; the other heuristics proved insufficient to address
1737  this, so the final answer to this was that when we expand the TDB file
1738  inside a transaction commit, we repack the entire tdb.
1739 \end_layout
1740
1741 \begin_layout Standard
1742 The single list lock limits our allocation rate; due to the other issues
1743  this is not currently seen as a bottleneck.
1744 \end_layout
1745
1746 \begin_layout Subsubsection
1747 Proposed Solution
1748 \change_deleted 0 1283336858
1749
1750 \end_layout
1751
1752 \begin_layout Standard
1753 The first step is to remove all the current heuristics, as they obviously
1754  interact, then examine them once the lock contention is addressed.
1755 \end_layout
1756
1757 \begin_layout Standard
1758 The free list must be split to reduce contention.
1759  Assuming perfect free merging, we can at most have 1 free list entry for
1760  each entry.
1761  This implies that the number of free lists is related to the size of the
1762  hash table, but as it is rare to walk a large number of free list entries
1763  we can use far fewer, say 1/32 of the number of hash buckets.
1764 \change_inserted 0 1283336910
1765
1766 \end_layout
1767
1768 \begin_layout Standard
1769
1770 \change_inserted 0 1283337052
1771 It seems tempting to try to reuse the hash implementation which we use for
1772  records here, but we have two ways of searching for free entries: for allocatio
1773 n we search by size (and possibly zone) which produces too many clashes
1774  for our hash table to handle well, and for coalescing we search by address.
1775  Thus an array of doubly-linked free lists seems preferable.
1776 \change_unchanged
1777
1778 \end_layout
1779
1780 \begin_layout Standard
1781 There are various benefits in using per-size free lists (see
1782 \begin_inset CommandInset ref
1783 LatexCommand ref
1784 reference "sub:TDB-Becomes-Fragmented"
1785
1786 \end_inset
1787
1788 ) but it's not clear this would reduce contention in the common case where
1789  all processes are allocating/freeing the same size.
1790  Thus we almost certainly need to divide in other ways: the most obvious
1791  is to divide the file into zones, and using a free list (or set of free
1792  lists) for each.
1793  This approximates address ordering.
1794 \end_layout
1795
1796 \begin_layout Standard
1797 Note that this means we need to split the free lists when we expand the
1798  file; this is probably acceptable when we double the hash table size, since
1799  that is such an expensive operation already.
1800  In the case of increasing the file size, there is an optimization we can
1801  use: if we use M in the formula above as the file size rounded up to the
1802  next power of 2, we only need reshuffle free lists when the file size crosses
1803  a power of 2 boundary,
1804 \emph on
1805 and
1806 \emph default
1807 reshuffling the free lists is trivial: we simply merge every consecutive
1808  pair of free lists.
1809 \end_layout
1810
1811 \begin_layout Standard
1812 The basic algorithm is as follows.
1813  Freeing is simple:
1814 \end_layout
1815
1816 \begin_layout Enumerate
1817 Identify the correct zone.
1818 \end_layout
1819
1820 \begin_layout Enumerate
1821 Lock the corresponding list.
1822 \end_layout
1823
1824 \begin_layout Enumerate
1825 Re-check the zone (we didn't have a lock, sizes could have changed): relock
1826  if necessary.
1827 \end_layout
1828
1829 \begin_layout Enumerate
1830 Place the freed entry in the list for that zone.
1831 \end_layout
1832
1833 \begin_layout Standard
1834 Allocation is a little more complicated, as we perform delayed coalescing
1835  at this point:
1836 \end_layout
1837
1838 \begin_layout Enumerate
1839 Pick a zone either the zone we last freed into, or based on a
1840 \begin_inset Quotes eld
1841 \end_inset
1842
1843 random
1844 \begin_inset Quotes erd
1845 \end_inset
1846
1847  number.
1848 \end_layout
1849
1850 \begin_layout Enumerate
1851 Lock the corresponding list.
1852 \end_layout
1853
1854 \begin_layout Enumerate
1855 Re-check the zone: relock if necessary.
1856 \end_layout
1857
1858 \begin_layout Enumerate
1859 If the top entry is -large enough, remove it from the list and return it.
1860 \end_layout
1861
1862 \begin_layout Enumerate
1863 Otherwise, coalesce entries in the list.If there was no entry large enough,
1864  unlock the list and try the next zone.
1865 \end_layout
1866
1867 \begin_layout Enumerate
1868 If no zone satisfies, expand the file.
1869 \end_layout
1870
1871 \begin_layout Standard
1872 This optimizes rapid insert/delete of free list entries by not coalescing
1873  them all the time..
1874  First-fit address ordering ordering seems to be fairly good for keeping
1875  fragmentation low (see
1876 \begin_inset CommandInset ref
1877 LatexCommand ref
1878 reference "sub:TDB-Becomes-Fragmented"
1879
1880 \end_inset
1881
1882 ).
1883  Note that address ordering does not need a tailer to coalesce, though if
1884  we needed one we could have one cheaply: see
1885 \begin_inset CommandInset ref
1886 LatexCommand ref
1887 reference "sub:Records-Incur-A"
1888
1889 \end_inset
1890
1891 .
1892
1893 \end_layout
1894
1895 \begin_layout Standard
1896 I anticipate that the number of entries in each free zone would be small,
1897  but it might be worth using one free entry to hold pointers to the others
1898  for cache efficiency.
1899 \change_inserted 0 1283309850
1900
1901 \end_layout
1902
1903 \begin_layout Standard
1904
1905 \change_inserted 0 1283337216
1906 \begin_inset CommandInset label
1907 LatexCommand label
1908 name "freelist-in-zone"
1909
1910 \end_inset
1911
1912 If we want to avoid locking complexity (enlarging the free lists when we
1913  enlarge the file) we could place the array of free lists at the beginning
1914  of each zone.
1915  This means existing array lists never move, but means that a record cannot
1916  be larger than a zone.
1917  That in turn implies that zones should be variable sized (say, power of
1918  2), which makes the question
1919 \begin_inset Quotes eld
1920 \end_inset
1921
1922 what zone is this record in?
1923 \begin_inset Quotes erd
1924 \end_inset
1925
1926  much harder (and
1927 \begin_inset Quotes eld
1928 \end_inset
1929
1930 pick a random zone
1931 \begin_inset Quotes erd
1932 \end_inset
1933
1934 , but that's less common).
1935  It could be done with as few as 4 bits from the record header.
1936 \begin_inset Foot
1937 status open
1938
1939 \begin_layout Plain Layout
1940
1941 \change_inserted 0 1283310945
1942 Using
1943 \begin_inset Formula $2^{16+N*3}$
1944 \end_inset
1945
1946 means 0 gives a minimal 65536-byte zone, 15 gives the maximal
1947 \begin_inset Formula $2^{61}$
1948 \end_inset
1949
1950  byte zone.
1951  Zones range in factor of 8 steps.
1952 \change_unchanged
1953
1954 \end_layout
1955
1956 \end_inset
1957
1958
1959 \change_unchanged
1960
1961 \end_layout
1962
1963 \begin_layout Subsection
1964 \begin_inset CommandInset label
1965 LatexCommand label
1966 name "sub:TDB-Becomes-Fragmented"
1967
1968 \end_inset
1969
1970 TDB Becomes Fragmented
1971 \end_layout
1972
1973 \begin_layout Standard
1974 Much of this is a result of allocation strategy
1975 \begin_inset Foot
1976 status collapsed
1977
1978 \begin_layout Plain Layout
1979 The Memory Fragmentation Problem: Solved? Johnstone & Wilson 1995 ftp://ftp.cs.ute
1980 xas.edu/pub/garbage/malloc/ismm98.ps
1981 \end_layout
1982
1983 \end_inset
1984
1985  and deliberate hobbling of coalescing; internal fragmentation (aka overallocati
1986 on) is deliberately set at 25%, and external fragmentation is only cured
1987  by the decision to repack the entire db when a transaction commit needs
1988  to enlarge the file.
1989 \end_layout
1990
1991 \begin_layout Subsubsection
1992 Proposed Solution
1993 \end_layout
1994
1995 \begin_layout Standard
1996 The 25% overhead on allocation works in practice for ldb because indexes
1997  tend to expand by one record at a time.
1998  This internal fragmentation can be resolved by having an
1999 \begin_inset Quotes eld
2000 \end_inset
2001
2002 expanded
2003 \begin_inset Quotes erd
2004 \end_inset
2005
2006  bit in the header to note entries that have previously expanded, and allocating
2007  more space for them.
2008 \end_layout
2009
2010 \begin_layout Standard
2011 There are is a spectrum of possible solutions for external fragmentation:
2012  one is to use a fragmentation-avoiding allocation strategy such as best-fit
2013  address-order allocator.
2014  The other end of the spectrum would be to use a bump allocator (very fast
2015  and simple) and simply repack the file when we reach the end.
2016 \end_layout
2017
2018 \begin_layout Standard
2019 There are three problems with efficient fragmentation-avoiding allocators:
2020  they are non-trivial, they tend to use a single free list for each size,
2021  and there's no evidence that tdb allocation patterns will match those recorded
2022  for general allocators (though it seems likely).
2023 \end_layout
2024
2025 \begin_layout Standard
2026 Thus we don't spend too much effort on external fragmentation; we will be
2027  no worse than the current code if we need to repack on occasion.
2028  More effort is spent on reducing freelist contention, and reducing overhead.
2029 \end_layout
2030
2031 \begin_layout Subsection
2032 \begin_inset CommandInset label
2033 LatexCommand label
2034 name "sub:Records-Incur-A"
2035
2036 \end_inset
2037
2038 Records Incur A 28-Byte Overhead
2039 \end_layout
2040
2041 \begin_layout Standard
2042 Each TDB record has a header as follows:
2043 \end_layout
2044
2045 \begin_layout LyX-Code
2046 struct tdb_record {
2047 \end_layout
2048
2049 \begin_layout LyX-Code
2050         tdb_off_t next; /* offset of the next record in the list */
2051 \end_layout
2052
2053 \begin_layout LyX-Code
2054         tdb_len_t rec_len; /* total byte length of record */
2055 \end_layout
2056
2057 \begin_layout LyX-Code
2058         tdb_len_t key_len; /* byte length of key */
2059 \end_layout
2060
2061 \begin_layout LyX-Code
2062         tdb_len_t data_len; /* byte length of data */
2063 \end_layout
2064
2065 \begin_layout LyX-Code
2066         uint32_t full_hash; /* the full 32 bit hash of the key */
2067 \end_layout
2068
2069 \begin_layout LyX-Code
2070         uint32_t magic;   /* try to catch errors */
2071 \end_layout
2072
2073 \begin_layout LyX-Code
2074         /* the following union is implied:
2075 \end_layout
2076
2077 \begin_layout LyX-Code
2078                 union {
2079 \end_layout
2080
2081 \begin_layout LyX-Code
2082                         char record[rec_len];
2083 \end_layout
2084
2085 \begin_layout LyX-Code
2086                         struct {
2087 \end_layout
2088
2089 \begin_layout LyX-Code
2090                                 char key[key_len];
2091 \end_layout
2092
2093 \begin_layout LyX-Code
2094                                 char data[data_len];
2095 \end_layout
2096
2097 \begin_layout LyX-Code
2098                         }
2099 \end_layout
2100
2101 \begin_layout LyX-Code
2102                         uint32_t totalsize; (tailer)
2103 \end_layout
2104
2105 \begin_layout LyX-Code
2106                 }
2107 \end_layout
2108
2109 \begin_layout LyX-Code
2110         */
2111 \end_layout
2112
2113 \begin_layout LyX-Code
2114 };
2115 \end_layout
2116
2117 \begin_layout Standard
2118 Naively, this would double to a 56-byte overhead on a 64 bit implementation.
2119 \end_layout
2120
2121 \begin_layout Subsubsection
2122 Proposed Solution
2123 \end_layout
2124
2125 \begin_layout Standard
2126 We can use various techniques to reduce this for an allocated block:
2127 \end_layout
2128
2129 \begin_layout Enumerate
2130 The 'next' pointer is not required, as we are using a flat hash table.
2131 \end_layout
2132
2133 \begin_layout Enumerate
2134 'rec_len' can instead be expressed as an addition to key_len and data_len
2135  (it accounts for wasted or overallocated length in the record).
2136  Since the record length is always a multiple of 8, we can conveniently
2137  fit it in 32 bits (representing up to 35 bits).
2138 \end_layout
2139
2140 \begin_layout Enumerate
2141 'key_len' and 'data_len' can be reduced.
2142  I'm unwilling to restrict 'data_len' to 32 bits, but instead we can combine
2143  the two into one 64-bit field and using a 5 bit value which indicates at
2144  what bit to divide the two.
2145  Keys are unlikely to scale as fast as data, so I'm assuming a maximum key
2146  size of 32 bits.
2147 \end_layout
2148
2149 \begin_layout Enumerate
2150 'full_hash' is used to avoid a memcmp on the
2151 \begin_inset Quotes eld
2152 \end_inset
2153
2154 miss
2155 \begin_inset Quotes erd
2156 \end_inset
2157
2158  case, but this is diminishing returns after a handful of bits (at 10 bits,
2159  it reduces 99.9% of false memcmp).
2160  As an aside, as the lower bits are already incorporated in the hash table
2161  resolution, the upper bits should be used here.
2162
2163 \change_inserted 0 1283336739
2164  Note that it's not clear that these bits will be a win, given the extra
2165  bits in the hash table itself (see
2166 \begin_inset CommandInset ref
2167 LatexCommand ref
2168 reference "sub:Hash-Size-Solution"
2169
2170 \end_inset
2171
2172 ).
2173 \change_unchanged
2174
2175 \end_layout
2176
2177 \begin_layout Enumerate
2178 'magic' does not need to be enlarged: it currently reflects one of 5 values
2179  (used, free, dead, recovery, and unused_recovery).
2180  It is useful for quick sanity checking however, and should not be eliminated.
2181 \end_layout
2182
2183 \begin_layout Enumerate
2184 'tailer' is only used to coalesce free blocks (so a block to the right can
2185  find the header to check if this block is free).
2186  This can be replaced by a single 'free' bit in the header of the following
2187  block (and the tailer only exists in free blocks).
2188 \begin_inset Foot
2189 status collapsed
2190
2191 \begin_layout Plain Layout
2192 This technique from Thomas Standish.
2193  Data Structure Techniques.
2194  Addison-Wesley, Reading, Massachusetts, 1980.
2195 \end_layout
2196
2197 \end_inset
2198
2199  The current proposed coalescing algorithm doesn't need this, however.
2200 \end_layout
2201
2202 \begin_layout Standard
2203 This produces a 16 byte used header like this:
2204 \end_layout
2205
2206 \begin_layout LyX-Code
2207 struct tdb_used_record {
2208 \end_layout
2209
2210 \begin_layout LyX-Code
2211         uint32_t magic : 16,
2212 \end_layout
2213
2214 \begin_layout LyX-Code
2215                  prev_is_free: 1,
2216 \end_layout
2217
2218 \begin_layout LyX-Code
2219                  key_data_divide: 5,
2220 \end_layout
2221
2222 \begin_layout LyX-Code
2223                  top_hash: 10;
2224 \end_layout
2225
2226 \begin_layout LyX-Code
2227         uint32_t extra_octets;
2228 \end_layout
2229
2230 \begin_layout LyX-Code
2231         uint64_t key_and_data_len;
2232 \end_layout
2233
2234 \begin_layout LyX-Code
2235 };
2236 \end_layout
2237
2238 \begin_layout Standard
2239 And a free record like this:
2240 \end_layout
2241
2242 \begin_layout LyX-Code
2243 struct tdb_free_record {
2244 \end_layout
2245
2246 \begin_layout LyX-Code
2247         uint32_t free_magic;
2248 \end_layout
2249
2250 \begin_layout LyX-Code
2251         uint64_t total_length;
2252 \change_inserted 0 1283337133
2253
2254 \end_layout
2255
2256 \begin_layout LyX-Code
2257
2258 \change_inserted 0 1283337139
2259         uint64_t prev, next;
2260 \change_unchanged
2261
2262 \end_layout
2263
2264 \begin_layout LyX-Code
2265         ...
2266 \end_layout
2267
2268 \begin_layout LyX-Code
2269         uint64_t tailer;
2270 \end_layout
2271
2272 \begin_layout LyX-Code
2273 };
2274 \end_layout
2275
2276 \begin_layout Standard
2277
2278 \change_inserted 0 1283337235
2279 We might want to take some bits from the used record's top_hash (and the
2280  free record which has 32 bits of padding to spare anyway) if we use variable
2281  sized zones.
2282  See
2283 \begin_inset CommandInset ref
2284 LatexCommand ref
2285 reference "freelist-in-zone"
2286
2287 \end_inset
2288
2289 .
2290 \change_unchanged
2291
2292 \end_layout
2293
2294 \begin_layout Subsection
2295 Transaction Commit Requires 4 fdatasync
2296 \end_layout
2297
2298 \begin_layout Standard
2299 The current transaction algorithm is:
2300 \end_layout
2301
2302 \begin_layout Enumerate
2303 write_recovery_data();
2304 \end_layout
2305
2306 \begin_layout Enumerate
2307 sync();
2308 \end_layout
2309
2310 \begin_layout Enumerate
2311 write_recovery_header();
2312 \end_layout
2313
2314 \begin_layout Enumerate
2315 sync();
2316 \end_layout
2317
2318 \begin_layout Enumerate
2319 overwrite_with_new_data();
2320 \end_layout
2321
2322 \begin_layout Enumerate
2323 sync();
2324 \end_layout
2325
2326 \begin_layout Enumerate
2327 remove_recovery_header();
2328 \end_layout
2329
2330 \begin_layout Enumerate
2331 sync();
2332 \end_layout
2333
2334 \begin_layout Standard
2335 On current ext3, each sync flushes all data to disk, so the next 3 syncs
2336  are relatively expensive.
2337  But this could become a performance bottleneck on other filesystems such
2338  as ext4.
2339 \end_layout
2340
2341 \begin_layout Subsubsection
2342 Proposed Solution
2343 \end_layout
2344
2345 \begin_layout Standard
2346 Neil Brown points out that this is overzealous, and only one sync is needed:
2347 \end_layout
2348
2349 \begin_layout Enumerate
2350 Bundle the recovery data, a transaction counter and a strong checksum of
2351  the new data.
2352 \end_layout
2353
2354 \begin_layout Enumerate
2355 Strong checksum that whole bundle.
2356 \end_layout
2357
2358 \begin_layout Enumerate
2359 Store the bundle in the database.
2360 \end_layout
2361
2362 \begin_layout Enumerate
2363 Overwrite the oldest of the two recovery pointers in the header (identified
2364  using the transaction counter) with the offset of this bundle.
2365 \end_layout
2366
2367 \begin_layout Enumerate
2368 sync.
2369 \end_layout
2370
2371 \begin_layout Enumerate
2372 Write the new data to the file.
2373 \end_layout
2374
2375 \begin_layout Standard
2376 Checking for recovery means identifying the latest bundle with a valid checksum
2377  and using the new data checksum to ensure that it has been applied.
2378  This is more expensive than the current check, but need only be done at
2379  open.
2380  For running databases, a separate header field can be used to indicate
2381  a transaction in progress; we need only check for recovery if this is set.
2382 \end_layout
2383
2384 \begin_layout Subsection
2385 \begin_inset CommandInset label
2386 LatexCommand label
2387 name "sub:TDB-Does-Not"
2388
2389 \end_inset
2390
2391 TDB Does Not Have Snapshot Support
2392 \end_layout
2393
2394 \begin_layout Subsubsection
2395 Proposed Solution
2396 \end_layout
2397
2398 \begin_layout Standard
2399 None.
2400  At some point you say
2401 \begin_inset Quotes eld
2402 \end_inset
2403
2404 use a real database
2405 \begin_inset Quotes erd
2406 \end_inset
2407
2408 .
2409 \end_layout
2410
2411 \begin_layout Standard
2412 But as a thought experiment, if we implemented transactions to only overwrite
2413  free entries (this is tricky: there must not be a header in each entry
2414  which indicates whether it is free, but use of presence in metadata elsewhere),
2415  and a pointer to the hash table, we could create an entirely new commit
2416  without destroying existing data.
2417  Then it would be easy to implement snapshots in a similar way.
2418 \end_layout
2419
2420 \begin_layout Standard
2421 This would not allow arbitrary changes to the database, such as tdb_repack
2422  does, and would require more space (since we have to preserve the current
2423  and future entries at once).
2424  If we used hash trees rather than one big hash table, we might only have
2425  to rewrite some sections of the hash, too.
2426 \end_layout
2427
2428 \begin_layout Standard
2429 We could then implement snapshots using a similar method, using multiple
2430  different hash tables/free tables.
2431 \end_layout
2432
2433 \begin_layout Subsection
2434 Transactions Cannot Operate in Parallel
2435 \end_layout
2436
2437 \begin_layout Standard
2438 This would be useless for ldb, as it hits the index records with just about
2439  every update.
2440  It would add significant complexity in resolving clashes, and cause the
2441  all transaction callers to write their code to loop in the case where the
2442  transactions spuriously failed.
2443 \end_layout
2444
2445 \begin_layout Subsubsection
2446 Proposed Solution
2447 \end_layout
2448
2449 \begin_layout Standard
2450 We could solve a small part of the problem by providing read-only transactions.
2451  These would allow one write transaction to begin, but it could not commit
2452  until all r/o transactions are done.
2453  This would require a new RO_TRANSACTION_LOCK, which would be upgraded on
2454  commit.
2455 \end_layout
2456
2457 \begin_layout Subsection
2458 Default Hash Function Is Suboptimal
2459 \end_layout
2460
2461 \begin_layout Standard
2462 The Knuth-inspired multiplicative hash used by tdb is fairly slow (especially
2463  if we expand it to 64 bits), and works best when the hash bucket size is
2464  a prime number (which also means a slow modulus).
2465  In addition, it is highly predictable which could potentially lead to a
2466  Denial of Service attack in some TDB uses.
2467 \end_layout
2468
2469 \begin_layout Subsubsection
2470 Proposed Solution
2471 \end_layout
2472
2473 \begin_layout Standard
2474 The Jenkins lookup3 hash
2475 \begin_inset Foot
2476 status open
2477
2478 \begin_layout Plain Layout
2479 http://burtleburtle.net/bob/c/lookup3.c
2480 \end_layout
2481
2482 \end_inset
2483
2484  is a fast and superbly-mixing hash.
2485  It's used by the Linux kernel and almost everything else.
2486  This has the particular properties that it takes an initial seed, and produces
2487  two 32 bit hash numbers, which we can combine into a 64-bit hash.
2488 \end_layout
2489
2490 \begin_layout Standard
2491 The seed should be created at tdb-creation time from some random source,
2492  and placed in the header.
2493  This is far from foolproof, but adds a little bit of protection against
2494  hash bombing.
2495 \end_layout
2496
2497 \begin_layout Subsection
2498 \begin_inset CommandInset label
2499 LatexCommand label
2500 name "Reliable-Traversal-Adds"
2501
2502 \end_inset
2503
2504 Reliable Traversal Adds Complexity
2505 \end_layout
2506
2507 \begin_layout Standard
2508 We lock a record during traversal iteration, and try to grab that lock in
2509  the delete code.
2510  If that grab on delete fails, we simply mark it deleted and continue onwards;
2511  traversal checks for this condition and does the delete when it moves off
2512  the record.
2513 \end_layout
2514
2515 \begin_layout Standard
2516 If traversal terminates, the dead record may be left indefinitely.
2517 \end_layout
2518
2519 \begin_layout Subsubsection
2520 Proposed Solution
2521 \end_layout
2522
2523 \begin_layout Standard
2524 Remove reliability guarantees; see
2525 \begin_inset CommandInset ref
2526 LatexCommand ref
2527 reference "traverse-Proposed-Solution"
2528
2529 \end_inset
2530
2531 .
2532 \end_layout
2533
2534 \begin_layout Subsection
2535 Fcntl Locking Adds Overhead
2536 \end_layout
2537
2538 \begin_layout Standard
2539 Placing a fcntl lock means a system call, as does removing one.
2540  This is actually one reason why transactions can be faster (everything
2541  is locked once at transaction start).
2542  In the uncontended case, this overhead can theoretically be eliminated.
2543 \end_layout
2544
2545 \begin_layout Subsubsection
2546 Proposed Solution
2547 \end_layout
2548
2549 \begin_layout Standard
2550 None.
2551 \end_layout
2552
2553 \begin_layout Standard
2554 We tried this before with spinlock support, in the early days of TDB, and
2555  it didn't make much difference except in manufactured benchmarks.
2556 \end_layout
2557
2558 \begin_layout Standard
2559 We could use spinlocks (with futex kernel support under Linux), but it means
2560  that we lose automatic cleanup when a process dies with a lock.
2561  There is a method of auto-cleanup under Linux, but it's not supported by
2562  other operating systems.
2563  We could reintroduce a clear-if-first-style lock and sweep for dead futexes
2564  on open, but that wouldn't help the normal case of one concurrent opener
2565  dying.
2566  Increasingly elaborate repair schemes could be considered, but they require
2567  an ABI change (everyone must use them) anyway, so there's no need to do
2568  this at the same time as everything else.
2569 \end_layout
2570
2571 \begin_layout Subsection
2572 Some Transactions Don't Require Durability
2573 \end_layout
2574
2575 \begin_layout Standard
2576 Volker points out that gencache uses a CLEAR_IF_FIRST tdb for normal (fast)
2577  usage, and occasionally empties the results into a transactional TDB.
2578  This kind of usage prioritizes performance over durability: as long as
2579  we are consistent, data can be lost.
2580 \end_layout
2581
2582 \begin_layout Standard
2583 This would be more neatly implemented inside tdb: a
2584 \begin_inset Quotes eld
2585 \end_inset
2586
2587 soft
2588 \begin_inset Quotes erd
2589 \end_inset
2590
2591  transaction commit (ie.
2592  syncless) which meant that data may be reverted on a crash.
2593 \end_layout
2594
2595 \begin_layout Subsubsection
2596 Proposed Solution
2597 \end_layout
2598
2599 \begin_layout Standard
2600 None.
2601 \end_layout
2602
2603 \begin_layout Standard
2604 Unfortunately any transaction scheme which overwrites old data requires
2605  a sync before that overwrite to avoid the possibility of corruption.
2606 \end_layout
2607
2608 \begin_layout Standard
2609 It seems possible to use a scheme similar to that described in
2610 \begin_inset CommandInset ref
2611 LatexCommand ref
2612 reference "sub:TDB-Does-Not"
2613
2614 \end_inset
2615
2616 ,where transactions are committed without overwriting existing data, and
2617  an array of top-level pointers were available in the header.
2618  If the transaction is
2619 \begin_inset Quotes eld
2620 \end_inset
2621
2622 soft
2623 \begin_inset Quotes erd
2624 \end_inset
2625
2626  then we would not need a sync at all: existing processes would pick up
2627  the new hash table and free list and work with that.
2628 \end_layout
2629
2630 \begin_layout Standard
2631 At some later point, a sync would allow recovery of the old data into the
2632  free lists (perhaps when the array of top-level pointers filled).
2633  On crash, tdb_open() would examine the array of top levels, and apply the
2634  transactions until it encountered an invalid checksum.
2635 \end_layout
2636
2637 \end_body
2638 \end_document
2639 @
2640
2641
2642 1.8
2643 log
2644 @Remove bogus footnote
2645 @
2646 text
2647 @d56 2
2648 a57 2
2649 \change_inserted 0 1283307544
2650 1-September
2651 d838 12
2652 d1198 103
2653 @
2654
2655
2656 1.7
2657 log
2658 @Moving hash table does not work.
2659 @
2660 text
2661 @a1436 12
2662 \begin_inset Foot
2663 status collapsed
2664
2665 \begin_layout Plain Layout
2666
2667 \change_inserted 0 1283336450
2668 If we make the hash offsets zone-relative, then this only restricts the
2669  zone size, not the overall database size.
2670 \end_layout
2671
2672 \end_inset
2673
2674 @
2675
2676
2677 1.6
2678 log
2679 @Commit changes
2680 @
2681 text
2682 @d38 1
2683 a38 1
2684 \author ""
2685 d53 7
2686 a59 1
2687 26-July-2010
2688 d1333 10
2689 d1361 3
2690 a1363 1
2691  There are three details which become important:
2692 d1367 2
2693 d1373 2
2694 d1379 2
2695 d1385 2
2696 d1397 2
2697 d1407 2
2698 d1411 45
2699 d1582 2
2700 d1598 14
2701 d1733 62
2702 d1996 13
2703 d2086 10
2704 d2110 15
2705 a2124 1
2706 \begin_layout LyX-Code
2707 @
2708
2709
2710 1.5
2711 log
2712 @Soft transaction commit
2713 @
2714 text
2715 @d38 1
2716 a38 1
2717 \author "Rusty Russell,,,"
2718 a52 4
2719
2720 \change_deleted 0 1280141199
2721 10-May-2010
2722 \change_inserted 0 1280141202
2723 a53 2
2724 \change_unchanged
2725
2726 a2028 2
2727
2728 \change_inserted 0 1280140902
2729 a2034 2
2730
2731 \change_unchanged
2732 a2212 2
2733 \change_inserted 0 1280140661
2734
2735 a2215 2
2736
2737 \change_inserted 0 1280140703
2738 a2219 2
2739
2740 \change_inserted 0 1280708312
2741 a2226 2
2742
2743 \change_inserted 0 1280708400
2744 a2239 2
2745
2746 \change_inserted 0 1280140836
2747 a2243 2
2748
2749 \change_inserted 0 1280708255
2750 a2247 2
2751
2752 \change_inserted 0 1280708374
2753 a2252 2
2754
2755 \change_inserted 0 1280141181
2756 a2274 2
2757
2758 \change_inserted 0 1280141345
2759 @
2760
2761
2762 1.4
2763 log
2764 @Merge changes
2765 @
2766 text
2767 @d38 1
2768 a38 1
2769 \author ""
2770 d53 2
2771 d56 4
2772 d2035 10
2773 d2223 84
2774 @
2775
2776
2777 1.3
2778 log
2779 @Transaction and freelist rethink.
2780 @
2781 text
2782 @d38 1
2783 a38 1
2784 \author "Rusty Russell,,,"
2785 d53 1
2786 a53 1
2787 27-April-2010
2788 d662 1
2789 a662 5
2790  behavior of disallowing
2791 \change_inserted 0 1272940179
2792 nested
2793 \change_unchanged
2794 transactions should become the default.
2795 a1210 2
2796 \change_inserted 0 1272944650
2797
2798 a1214 2
2799
2800 \change_inserted 0 1272944763
2801 a1218 2
2802 \change_unchanged
2803
2804 a1223 2
2805 \change_unchanged
2806
2807 a1301 2
2808
2809 \change_inserted 0 1273478114
2810 a1310 2
2811 \change_unchanged
2812
2813 d1515 1
2814 a1515 11
2815 The free list
2816 \change_deleted 0 1273469807
2817 should
2818 \change_inserted 0 1273469810
2819 must
2820 \change_unchanged
2821  be split
2822 \change_deleted 0 1273469815
2823 into multiple lists
2824 \change_unchanged
2825 to reduce contention.
2826 a1520 2
2827 \change_inserted 0 1273470006
2828
2829 a1523 2
2830
2831 \change_inserted 0 1273492055
2832 a1539 2
2833
2834 \change_inserted 0 1273483888
2835 a1551 2
2836 \change_unchanged
2837
2838 a1554 8
2839
2840 \change_deleted 0 1272942055
2841 There are various ways to organize these lisys, but because we want to be
2842  able to quickly identify which free list an entry is in, and reduce the
2843  number of locks required for merging, we will use zoning (eg.
2844  each free list covers some fixed fraction of the file).
2845
2846 \change_inserted 0 1273484187
2847 d1556 1
2848 a1556 7
2849
2850 \change_deleted 0 1273484194
2851 The algorithm for f
2852 \change_inserted 0 1273484194
2853 F
2854 \change_unchanged
2855 reeing is simple:
2856 d1560 1
2857 a1560 7
2858 Identify the correct
2859 \change_deleted 0 1273482856
2860 free list
2861 \change_inserted 0 1273482857
2862 zone
2863 \change_unchanged
2864 .
2865 d1564 1
2866 a1564 7
2867 Lock the
2868 \change_inserted 0 1273482895
2869 corresponding
2870 \change_unchanged
2871 list
2872 \change_inserted 0 1273482863
2873 .
2874 a1567 2
2875
2876 \change_inserted 0 1273482909
2877 d1573 1
2878 a1573 13
2879
2880 \change_deleted 0 1273482885
2881 , and p
2882 \change_inserted 0 1273482888
2883 P
2884 \change_unchanged
2885 lace the freed entry
2886 \change_deleted 0 1273492415
2887 at the head
2888 \change_inserted 0 1273492415
2889 in the list for that zone
2890 \change_unchanged
2891 .
2892 d1577 2
2893 a1578 7
2894 Allocation is a little more complicated, as we
2895 \change_deleted 0 1273483240
2896 merge entries as we walk the list:
2897 \change_inserted 0 1273484250
2898 perform delayed coalescing at this point:
2899 \change_unchanged
2900
2901 d1582 1
2902 a1582 19
2903 Pick a
2904 \change_deleted 0 1273482955
2905 free list;
2906 \change_inserted 0 1273482957
2907 zone
2908 \change_unchanged
2909  either the
2910 \change_deleted 0 1273482962
2911 list
2912 \change_inserted 0 1273482962
2913 zone
2914 \change_unchanged
2915  we last freed
2916 \change_deleted 0 1273482966
2917 o
2918 \change_inserted 0 1273482966
2919 i
2920 \change_unchanged
2921 nto, or based on a
2922 d1594 1
2923 a1594 9
2924 Lock th
2925 \change_inserted 0 1273482980
2926 e corresponding
2927 \change_deleted 0 1273482973
2928 at
2929 \change_unchanged
2930  list.
2931 \change_inserted 0 1273482982
2932
2933 a1597 2
2934
2935 \change_inserted 0 1273483084
2936 a1598 53
2937 \change_unchanged
2938
2939 \end_layout
2940
2941 \begin_layout Enumerate
2942 If the top entry is
2943 \change_deleted 0 1273492155
2944 well-sized,
2945 \change_inserted 0 1273492159
2946 -large enough,
2947 \change_unchanged
2948 remove it from the list and return it.
2949 \end_layout
2950
2951 \begin_layout Enumerate
2952 Otherwise,
2953 \change_inserted 0 1273492206
2954 coalesce entries in the list.
2955 \change_deleted 0 1273492200
2956 examine the entry to the right of it in the file.
2957  If it is free:
2958 \end_layout
2959
2960 \begin_deeper
2961 \begin_layout Enumerate
2962
2963 \change_deleted 0 1273492200
2964 If that entry is in a different list, lock that list too.
2965 \end_layout
2966
2967 \begin_layout Enumerate
2968
2969 \change_deleted 0 1273492200
2970 If we had to place a new lock, re-check that the entry is free.
2971 \end_layout
2972
2973 \begin_layout Enumerate
2974
2975 \change_deleted 0 1273492200
2976 Remove that entry from its free list and expand this entry to cover it.
2977 \end_layout
2978
2979 \begin_layout Enumerate
2980
2981 \change_deleted 0 1273485554
2982 Goto step 3.
2983 \end_layout
2984
2985 \end_deeper
2986 \begin_layout Enumerate
2987
2988 \change_inserted 0 1273485311
2989 If there was no entry large enough, unlock the list and try the next zone.
2990 d1602 1
2991 a1602 5
2992
2993 \change_deleted 0 1273483646
2994 Repeat step 3 with each entry in the list.
2995 \change_unchanged
2996
2997 d1606 2
2998 a1607 5
2999
3000 \change_deleted 0 1273483668
3001 Unlock the list and repeat step 2 with the next list.
3002 \change_unchanged
3003
3004 d1611 1
3005 a1611 7
3006 If no
3007 \change_deleted 0 1273483671
3008 list
3009 \change_inserted 0 1273483671
3010 zone
3011 \change_unchanged
3012  satisfies, expand the file.
3013 d1615 2
3014 a1616 9
3015 This optimizes rapid insert/delete of free list entries
3016 \change_inserted 0 1273485794
3017  by not coalescing them all the time.
3018 \change_deleted 0 1273483685
3019 , and allows us to get rid of the tailer altogether
3020 \change_unchanged
3021 .
3022
3023 \change_inserted 0 1273492299
3024 a1638 39
3025
3026 \change_deleted 0 1273476840
3027 The question of
3028 \begin_inset Quotes eld
3029 \end_inset
3030
3031 well-sized
3032 \begin_inset Quotes erd
3033 \end_inset
3034
3035  free entries is more difficult: the 25% overhead works in practice for
3036  ldb because indexes tend to expand by one record at a time.
3037  This can be resolved by having an
3038 \begin_inset Quotes eld
3039 \end_inset
3040
3041 expanded
3042 \begin_inset Quotes erd
3043 \end_inset
3044
3045  bit in the header to note entries that have previously expanded, and allocating
3046  more space for them.
3047  Whether the
3048 \begin_inset Quotes eld
3049 \end_inset
3050
3051 increasing slack
3052 \begin_inset Quotes erd
3053 \end_inset
3054
3055  algorithm should be implemented or first-fit used is still unknown: we
3056  will determine this once these other ideas are implemented.
3057 \change_inserted 0 1273483750
3058
3059 \end_layout
3060
3061 \begin_layout Standard
3062
3063 \change_inserted 0 1273492450
3064 a1644 2
3065
3066 \change_inserted 0 1273470441
3067 a1654 2
3068
3069 \change_inserted 0 1273476556
3070 a1659 2
3071
3072 \change_inserted 0 1273470423
3073 a1661 2
3074 \change_unchanged
3075
3076 a1672 2
3077
3078 \change_inserted 0 1273476847
3079 a1676 2
3080
3081 \change_inserted 0 1273476886
3082 a1691 2
3083
3084 \change_inserted 0 1273477233
3085 a1699 2
3086
3087 \change_inserted 0 1273477534
3088 a1706 2
3089
3090 \change_inserted 0 1273482700
3091 a1712 2
3092
3093 \change_inserted 0 1273478079
3094 a1722 2
3095
3096 \change_inserted 0 1273477839
3097 a1726 2
3098
3099 \change_inserted 0 1273477925
3100 a1730 2
3101
3102 \change_inserted 0 1273477925
3103 a1734 2
3104
3105 \change_inserted 0 1273477925
3106 a1738 2
3107
3108 \change_inserted 0 1273477925
3109 a1742 2
3110
3111 \change_inserted 0 1273477925
3112 a1746 2
3113
3114 \change_inserted 0 1273477925
3115 a1750 2
3116
3117 \change_inserted 0 1273477925
3118 a1754 2
3119
3120 \change_inserted 0 1273477925
3121 a1758 2
3122
3123 \change_inserted 0 1273477925
3124 a1762 2
3125
3126 \change_inserted 0 1273477925
3127 a1766 2
3128
3129 \change_inserted 0 1273477925
3130 a1770 2
3131
3132 \change_inserted 0 1273477925
3133 a1774 2
3134
3135 \change_inserted 0 1273477925
3136 a1778 2
3137
3138 \change_inserted 0 1273477925
3139 a1782 2
3140
3141 \change_inserted 0 1273477925
3142 a1786 2
3143
3144 \change_inserted 0 1273477925
3145 a1790 2
3146
3147 \change_inserted 0 1273477925
3148 a1794 2
3149
3150 \change_inserted 0 1273477925
3151 a1798 2
3152
3153 \change_inserted 0 1273492522
3154 a1802 2
3155
3156 \change_inserted 0 1273492530
3157 a1806 2
3158
3159 \change_inserted 0 1273492546
3160 a1810 2
3161
3162 \change_inserted 0 1273478239
3163 a1814 2
3164
3165 \change_inserted 0 1273479960
3166 a1821 2
3167
3168 \change_inserted 0 1273480265
3169 a1830 2
3170
3171 \change_inserted 0 1273480354
3172 a1845 2
3173
3174 \change_inserted 0 1273478968
3175 a1851 2
3176
3177 \change_inserted 0 1273492604
3178 a1859 2
3179
3180 \change_inserted 0 1273479572
3181 a1862 2
3182 \change_unchanged
3183
3184 a1870 2
3185
3186 \change_inserted 0 1273480282
3187 a1874 2
3188
3189 \change_inserted 0 1273478931
3190 a1878 2
3191
3192 \change_inserted 0 1273481549
3193 a1882 2
3194
3195 \change_inserted 0 1273481557
3196 a1886 2
3197
3198 \change_inserted 0 1273480307
3199 a1890 2
3200
3201 \change_inserted 0 1273480335
3202 a1894 2
3203
3204 \change_inserted 0 1273479897
3205 a1898 2
3206
3207 \change_inserted 0 1273479653
3208 a1902 2
3209
3210 \change_inserted 0 1273480371
3211 a1906 2
3212
3213 \change_inserted 0 1273480464
3214 a1910 2
3215
3216 \change_inserted 0 1273480399
3217 a1914 2
3218
3219 \change_inserted 0 1273480425
3220 a1918 2
3221
3222 \change_inserted 0 1273480453
3223 a1922 2
3224
3225 \change_inserted 0 1273480455
3226 a1926 2
3227
3228 \change_inserted 0 1273480450
3229 a1930 2
3230
3231 \change_inserted 0 1273480452
3232 a1935 2
3233 \change_inserted 0 1273478830
3234
3235 a1942 5
3236
3237 \change_deleted 0 1273481604
3238 In theory, we could get away with 2: one after we write the new data, and
3239  one to somehow atomically change over to it.
3240 \change_inserted 0 1273481632
3241 a1946 2
3242
3243 \change_inserted 0 1273481724
3244 a1950 2
3245
3246 \change_inserted 0 1273481713
3247 a1954 2
3248
3249 \change_inserted 0 1273481717
3250 a1958 2
3251
3252 \change_inserted 0 1273481730
3253 a1962 2
3254
3255 \change_inserted 0 1273481736
3256 a1966 2
3257
3258 \change_inserted 0 1273481744
3259 a1970 2
3260
3261 \change_inserted 0 1273481748
3262 a1974 2
3263
3264 \change_inserted 0 1273482185
3265 a1978 2
3266
3267 \change_inserted 0 1273482259
3268 a1989 50
3269
3270 \change_deleted 0 1273481848
3271 None.
3272  Trying to rewrite the transaction code is a separate experiment, which
3273  I encourage someone else to do.
3274  At some point you say
3275 \begin_inset Quotes eld
3276 \end_inset
3277
3278 use a real database
3279 \begin_inset Quotes erd
3280 \end_inset
3281
3282 .
3283 \end_layout
3284
3285 \begin_layout Standard
3286
3287 \change_deleted 0 1273481848
3288 But as a thought experiment:
3289 \change_unchanged
3290
3291 \end_layout
3292
3293 \begin_layout Standard
3294
3295 \change_deleted 0 1273481788
3296 Say there was a pointer in the header which said where the hash table and
3297  free list tables were, and that no blocks were labeled with whether they
3298  were free or not (it had to be derived from what list they were in).
3299  We could create new hash table and free list in some free space, and populate
3300  it as we want the post-committed state to look.
3301  Then we sync, then we switch the offset in the header, then we sync again.
3302 \end_layout
3303
3304 \begin_layout Standard
3305
3306 \change_deleted 0 1273481788
3307 This would not allow arbitrary changes to the database, such as tdb_repack
3308  does, and would require more space (since we have to preserve the current
3309  and future entries at once).
3310  If we used hash trees rather than one big hash table, we might only have
3311  to rewrite some sections of the hash, too.
3312 \change_inserted 0 1273481854
3313
3314 \end_layout
3315
3316 \begin_layout Standard
3317
3318 \change_inserted 0 1273482102
3319 a1993 2
3320
3321 \change_inserted 0 1273482061
3322 a1998 2
3323
3324 \change_inserted 0 1273482063
3325 a2002 2
3326
3327 \change_inserted 0 1273482072
3328 a2006 2
3329
3330 \change_inserted 0 1273482139
3331 a2011 2
3332
3333 \change_inserted 0 1273482364
3334 a2015 2
3335
3336 \change_inserted 0 1273482163
3337 a2019 2
3338
3339 \change_inserted 0 1273482493
3340 a2037 2
3341
3342 \change_inserted 0 1273482536
3343 a2046 2
3344 \change_unchanged
3345
3346 a2049 2
3347
3348 \change_inserted 0 1273482641
3349 a2058 2
3350
3351 \change_inserted 0 1273481827
3352 d2067 2
3353 a2068 11
3354 We could
3355 \change_inserted 0 1273481829
3356 then
3357 \change_unchanged
3358 implement snapshots using a similar method
3359 \change_deleted 0 1273481838
3360  to the above, only
3361 \change_inserted 0 1273481840
3362 ,
3363 \change_unchanged
3364  using multiple different hash tables/free tables.
3365 @
3366
3367
3368 1.2
3369 log
3370 @After first feedback (Ronnie & Volker)
3371 @
3372 text
3373 @d1314 13
3374 d1531 11
3375 a1541 1
3376 The free list should be split into multiple lists to reduce contention.
3377 d1547 39
3378 d1596 7
3379 d1604 1
3380 a1604 1
3381 The algorithm for freeing is simple:
3382 d1608 7
3383 a1614 1
3384 Identify the correct free list.
3385 d1618 30
3386 a1647 1
3387 Lock the list, and place the freed entry at the head.
3388 d1651 7
3389 a1657 2
3390 Allocation is a little more complicated, as we merge entries as we walk
3391  the list:
3392 d1661 19
3393 a1679 1
3394 Pick a free list; either the list we last freed onto, or based on a
3395 d1691 17
3396 a1707 1
3397 Lock that list.
3398 d1711 7
3399 a1717 1
3400 If the top entry is well-sized, remove it from the list and return it.
3401 d1721 5
3402 a1725 1
3403 Otherwise, examine the entry to the right of it in the file.
3404 d1731 2
3405 d1737 2
3406 d1743 2
3407 d1749 2
3408 d1756 8
3409 d1765 2
3410 d1770 2
3411 d1773 2
3412 d1778 7
3413 a1784 1
3414 If no list satisfies, expand the file.
3415 d1788 28
3416 a1815 2
3417 This optimizes rapid insert/delete of free list entries, and allows us to
3418  get rid of the tailer altogether.
3419 d1819 2
3420 d1851 1
3421 a1851 1
3422 \change_inserted 0 1272941474
3423 d1857 303
3424 a2159 18
3425 \change_inserted 0 1272942759
3426 There are various ways to organize these lists, but because we want to be
3427  able to quickly identify which free list an entry is in, and reduce the
3428  number of locks required for merging, we will use zoning (eg.
3429  each of the N free lists in a tdb file of size M covers a fixed fraction
3430  M/N).
3431  Note that this means we need to reshuffle the free lists when we expand
3432  the file; this is probably acceptable when we double the hash table size,
3433  since that is such an expensive operation already.
3434  In the case of increasing the file size, there is an optimization we can
3435  use: if we use M in the formula above as the file size rounded up to the
3436  next power of 2, we only need reshuffle free lists when the file size crosses
3437  a power of 2 boundary,
3438 \emph on
3439 and
3440 \emph default
3441 reshuffling the free lists is trivial: we simply merge every consecutive
3442  pair of free lists.
3443 d2164 107
3444 d2276 2
3445 d2280 59
3446 d2346 2
3447 d2363 2
3448 d2366 2
3449 d2371 2
3450 d2382 2
3451 d2389 57
3452 d2458 13
3453 d2474 32
3454 a2505 2
3455 We could implement snapshots using a similar method to the above, only using
3456  multiple different hash tables/free tables.
3457 @
3458
3459
3460 1.1
3461 log
3462 @Initial revision
3463 @
3464 text
3465 @d1 1
3466 a1 1
3467 #LyX 1.6.4 created this file. For more info see http://www.lyx.org/
3468 d36 3
3469 a38 3
3470 \tracking_changes false
3471 \output_changes false
3472 \author ""
3473 d662 5
3474 a666 1
3475  behavior of disallowing transactions should become the default.
3476 d1215 21
3477 d1527 2
3478 d1533 3
3479 a1535 1
3480  The algorithm for freeing is simple:
3481 d1642 26
3482 @