git.ozlabs.org Git - ccan/blob - ccan/tdb2/doc/design.lyx,v

   1 head    1.10;
   2 access;
   3 symbols;
   4 locks; strict;
   5 comment @# @;
   6
   7
   8 1.10
   9 date    2010.09.14.00.33.57;    author rusty;   state Exp;
  10 branches;
  11 next    1.9;
  12
  13 1.9
  14 date    2010.09.09.07.25.12;    author rusty;   state Exp;
  15 branches;
  16 next    1.8;
  17
  18 1.8
  19 date    2010.09.02.02.29.05;    author rusty;   state Exp;
  20 branches;
  21 next    1.7;
  22
  23 1.7
  24 date    2010.09.01.10.58.12;    author rusty;   state Exp;
  25 branches;
  26 next    1.6;
  27
  28 1.6
  29 date    2010.08.02.00.21.43;    author rusty;   state Exp;
  30 branches;
  31 next    1.5;
  32
  33 1.5
  34 date    2010.08.02.00.21.16;    author rusty;   state Exp;
  35 branches;
  36 next    1.4;
  37
  38 1.4
  39 date    2010.05.10.13.09.11;    author rusty;   state Exp;
  40 branches;
  41 next    1.3;
  42
  43 1.3
  44 date    2010.05.10.11.58.37;    author rusty;   state Exp;
  45 branches;
  46 next    1.2;
  47
  48 1.2
  49 date    2010.05.10.05.35.13;    author rusty;   state Exp;
  50 branches;
  51 next    1.1;
  52
  53 1.1
  54 date    2010.05.04.02.29.16;    author rusty;   state Exp;
  55 branches;
  56 next    ;
  57
  58
  59 desc
  60 @First draft
  61 @
  62
  63
  64 1.10
  65 log
  66 @Tracing attribute, talloc support.
  67 @
  68 text
  69 @#LyX 1.6.5 created this file. For more info see http://www.lyx.org/
  70 \lyxformat 345
  71 \begin_document
  72 \begin_header
  73 \textclass article
  74 \use_default_options true
  75 \language english
  76 \inputencoding auto
  77 \font_roman default
  78 \font_sans default
  79 \font_typewriter default
  80 \font_default_family default
  81 \font_sc false
  82 \font_osf false
  83 \font_sf_scale 100
  84 \font_tt_scale 100
  85
  86 \graphics default
  87 \paperfontsize default
  88 \use_hyperref false
  89 \papersize default
  90 \use_geometry false
  91 \use_amsmath 1
  92 \use_esint 1
  93 \cite_engine basic
  94 \use_bibtopic false
  95 \paperorientation portrait
  96 \secnumdepth 3
  97 \tocdepth 3
  98 \paragraph_separation indent
  99 \defskip medskip
 100 \quotes_language english
 101 \papercolumns 1
 102 \papersides 1
 103 \paperpagestyle default
 104 \tracking_changes true
 105 \output_changes true
 106 \author "Rusty Russell,,,"
 107 \author ""
 108 \end_header
 109
 110 \begin_body
 111
 112 \begin_layout Title
 113 TDB2: A Redesigning The Trivial DataBase
 114 \end_layout
 115
 116 \begin_layout Author
 117 Rusty Russell, IBM Corporation
 118 \end_layout
 119
 120 \begin_layout Date
 121
 122 \change_deleted 0 1283307542
 123 26-July
 124 \change_inserted 0 1284423485
 125 14-September
 126 \change_unchanged
 127 -2010
 128 \end_layout
 129
 130 \begin_layout Abstract
 131 The Trivial DataBase on-disk format is 32 bits; with usage cases heading
 132  towards the 4G limit, that must change.
 133  This required breakage provides an opportunity to revisit TDB's other design
 134  decisions and reassess them.
 135 \end_layout
 136
 137 \begin_layout Section
 138 Introduction
 139 \end_layout
 140
 141 \begin_layout Standard
 142 The Trivial DataBase was originally written by Andrew Tridgell as a simple
 143  key/data pair storage system with the same API as dbm, but allowing multiple
 144  readers and writers while being small enough (< 1000 lines of C) to include
 145  in SAMBA.
 146  The simple design created in 1999 has proven surprisingly robust and performant
 147 , used in Samba versions 3 and 4 as well as numerous other projects.
 148  Its useful life was greatly increased by the (backwards-compatible!) addition
 149  of transaction support in 2005.
 150 \end_layout
 151
 152 \begin_layout Standard
 153 The wider variety and greater demands of TDB-using code has lead to some
 154  organic growth of the API, as well as some compromises on the implementation.
 155  None of these, by themselves, are seen as show-stoppers, but the cumulative
 156  effect is to a loss of elegance over the initial, simple TDB implementation.
 157  Here is a table of the approximate number of lines of implementation code
 158  and number of API functions at the end of each year:
 159 \end_layout
 160
 161 \begin_layout Standard
 162 \begin_inset Tabular
 163 <lyxtabular version="3" rows="12" columns="3">
 164 <features>
 165 <column alignment="center" valignment="top" width="0">
 166 <column alignment="center" valignment="top" width="0">
 167 <column alignment="center" valignment="top" width="0">
 168 <row>
 169 <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
 170 \begin_inset Text
 171
 172 \begin_layout Plain Layout
 173 Year End
 174 \end_layout
 175
 176 \end_inset
 177 </cell>
 178 <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
 179 \begin_inset Text
 180
 181 \begin_layout Plain Layout
 182 API Functions
 183 \end_layout
 184
 185 \end_inset
 186 </cell>
 187 <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
 188 \begin_inset Text
 189
 190 \begin_layout Plain Layout
 191 Lines of C Code Implementation
 192 \end_layout
 193
 194 \end_inset
 195 </cell>
 196 </row>
 197 <row>
 198 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 199 \begin_inset Text
 200
 201 \begin_layout Plain Layout
 202 1999
 203 \end_layout
 204
 205 \end_inset
 206 </cell>
 207 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 208 \begin_inset Text
 209
 210 \begin_layout Plain Layout
 211 13
 212 \end_layout
 213
 214 \end_inset
 215 </cell>
 216 <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
 217 \begin_inset Text
 218
 219 \begin_layout Plain Layout
 220 1195
 221 \end_layout
 222
 223 \end_inset
 224 </cell>
 225 </row>
 226 <row>
 227 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 228 \begin_inset Text
 229
 230 \begin_layout Plain Layout
 231 2000
 232 \end_layout
 233
 234 \end_inset
 235 </cell>
 236 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 237 \begin_inset Text
 238
 239 \begin_layout Plain Layout
 240 24
 241 \end_layout
 242
 243 \end_inset
 244 </cell>
 245 <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
 246 \begin_inset Text
 247
 248 \begin_layout Plain Layout
 249 1725
 250 \end_layout
 251
 252 \end_inset
 253 </cell>
 254 </row>
 255 <row>
 256 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 257 \begin_inset Text
 258
 259 \begin_layout Plain Layout
 260 2001
 261 \end_layout
 262
 263 \end_inset
 264 </cell>
 265 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 266 \begin_inset Text
 267
 268 \begin_layout Plain Layout
 269 32
 270 \end_layout
 271
 272 \end_inset
 273 </cell>
 274 <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
 275 \begin_inset Text
 276
 277 \begin_layout Plain Layout
 278 2228
 279 \end_layout
 280
 281 \end_inset
 282 </cell>
 283 </row>
 284 <row>
 285 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 286 \begin_inset Text
 287
 288 \begin_layout Plain Layout
 289 2002
 290 \end_layout
 291
 292 \end_inset
 293 </cell>
 294 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 295 \begin_inset Text
 296
 297 \begin_layout Plain Layout
 298 35
 299 \end_layout
 300
 301 \end_inset
 302 </cell>
 303 <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
 304 \begin_inset Text
 305
 306 \begin_layout Plain Layout
 307 2481
 308 \end_layout
 309
 310 \end_inset
 311 </cell>
 312 </row>
 313 <row>
 314 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 315 \begin_inset Text
 316
 317 \begin_layout Plain Layout
 318 2003
 319 \end_layout
 320
 321 \end_inset
 322 </cell>
 323 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 324 \begin_inset Text
 325
 326 \begin_layout Plain Layout
 327 35
 328 \end_layout
 329
 330 \end_inset
 331 </cell>
 332 <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
 333 \begin_inset Text
 334
 335 \begin_layout Plain Layout
 336 2552
 337 \end_layout
 338
 339 \end_inset
 340 </cell>
 341 </row>
 342 <row>
 343 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 344 \begin_inset Text
 345
 346 \begin_layout Plain Layout
 347 2004
 348 \end_layout
 349
 350 \end_inset
 351 </cell>
 352 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 353 \begin_inset Text
 354
 355 \begin_layout Plain Layout
 356 40
 357 \end_layout
 358
 359 \end_inset
 360 </cell>
 361 <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
 362 \begin_inset Text
 363
 364 \begin_layout Plain Layout
 365 2584
 366 \end_layout
 367
 368 \end_inset
 369 </cell>
 370 </row>
 371 <row>
 372 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 373 \begin_inset Text
 374
 375 \begin_layout Plain Layout
 376 2005
 377 \end_layout
 378
 379 \end_inset
 380 </cell>
 381 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 382 \begin_inset Text
 383
 384 \begin_layout Plain Layout
 385 38
 386 \end_layout
 387
 388 \end_inset
 389 </cell>
 390 <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
 391 \begin_inset Text
 392
 393 \begin_layout Plain Layout
 394 2647
 395 \end_layout
 396
 397 \end_inset
 398 </cell>
 399 </row>
 400 <row>
 401 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 402 \begin_inset Text
 403
 404 \begin_layout Plain Layout
 405 2006
 406 \end_layout
 407
 408 \end_inset
 409 </cell>
 410 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 411 \begin_inset Text
 412
 413 \begin_layout Plain Layout
 414 52
 415 \end_layout
 416
 417 \end_inset
 418 </cell>
 419 <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
 420 \begin_inset Text
 421
 422 \begin_layout Plain Layout
 423 3754
 424 \end_layout
 425
 426 \end_inset
 427 </cell>
 428 </row>
 429 <row>
 430 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 431 \begin_inset Text
 432
 433 \begin_layout Plain Layout
 434 2007
 435 \end_layout
 436
 437 \end_inset
 438 </cell>
 439 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 440 \begin_inset Text
 441
 442 \begin_layout Plain Layout
 443 66
 444 \end_layout
 445
 446 \end_inset
 447 </cell>
 448 <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
 449 \begin_inset Text
 450
 451 \begin_layout Plain Layout
 452 4398
 453 \end_layout
 454
 455 \end_inset
 456 </cell>
 457 </row>
 458 <row>
 459 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 460 \begin_inset Text
 461
 462 \begin_layout Plain Layout
 463 2008
 464 \end_layout
 465
 466 \end_inset
 467 </cell>
 468 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 469 \begin_inset Text
 470
 471 \begin_layout Plain Layout
 472 71
 473 \end_layout
 474
 475 \end_inset
 476 </cell>
 477 <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
 478 \begin_inset Text
 479
 480 \begin_layout Plain Layout
 481 4768
 482 \end_layout
 483
 484 \end_inset
 485 </cell>
 486 </row>
 487 <row>
 488 <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
 489 \begin_inset Text
 490
 491 \begin_layout Plain Layout
 492 2009
 493 \end_layout
 494
 495 \end_inset
 496 </cell>
 497 <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
 498 \begin_inset Text
 499
 500 \begin_layout Plain Layout
 501 73
 502 \end_layout
 503
 504 \end_inset
 505 </cell>
 506 <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
 507 \begin_inset Text
 508
 509 \begin_layout Plain Layout
 510 5715
 511 \end_layout
 512
 513 \end_inset
 514 </cell>
 515 </row>
 516 </lyxtabular>
 517
 518 \end_inset
 519
 520
 521 \end_layout
 522
 523 \begin_layout Standard
 524 This review is an attempt to catalog and address all the known issues with
 525  TDB and create solutions which address the problems without significantly
 526  increasing complexity; all involved are far too aware of the dangers of
 527  second system syndrome in rewriting a successful project like this.
 528 \end_layout
 529
 530 \begin_layout Section
 531 API Issues
 532 \end_layout
 533
 534 \begin_layout Subsection
 535 tdb_open_ex Is Not Expandable
 536 \end_layout
 537
 538 \begin_layout Standard
 539 The tdb_open() call was expanded to tdb_open_ex(), which added an optional
 540  hashing function and an optional logging function argument.
 541  Additional arguments to open would require the introduction of a tdb_open_ex2
 542  call etc.
 543 \end_layout
 544
 545 \begin_layout Subsubsection
 546 Proposed Solution
 547 \change_inserted 0 1284422789
 548
 549 \begin_inset CommandInset label
 550 LatexCommand label
 551 name "attributes"
 552
 553 \end_inset
 554
 555
 556 \change_unchanged
 557
 558 \end_layout
 559
 560 \begin_layout Standard
 561 tdb_open() will take a linked-list of attributes:
 562 \end_layout
 563
 564 \begin_layout LyX-Code
 565 enum tdb_attribute {
 566 \end_layout
 567
 568 \begin_layout LyX-Code
 569     TDB_ATTRIBUTE_LOG = 0,
 570 \end_layout
 571
 572 \begin_layout LyX-Code
 573     TDB_ATTRIBUTE_HASH = 1
 574 \end_layout
 575
 576 \begin_layout LyX-Code
 577 };
 578 \end_layout
 579
 580 \begin_layout LyX-Code
 581 struct tdb_attribute_base {
 582 \end_layout
 583
 584 \begin_layout LyX-Code
 585     enum tdb_attribute attr;
 586 \end_layout
 587
 588 \begin_layout LyX-Code
 589     union tdb_attribute *next;
 590 \end_layout
 591
 592 \begin_layout LyX-Code
 593 };
 594 \end_layout
 595
 596 \begin_layout LyX-Code
 597 struct tdb_attribute_log {
 598 \end_layout
 599
 600 \begin_layout LyX-Code
 601     struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG */
 602 \end_layout
 603
 604 \begin_layout LyX-Code
 605     tdb_log_func log_fn;
 606 \end_layout
 607
 608 \begin_layout LyX-Code
 609     void *log_private;
 610 \end_layout
 611
 612 \begin_layout LyX-Code
 613 };
 614 \end_layout
 615
 616 \begin_layout LyX-Code
 617 struct tdb_attribute_hash {
 618 \end_layout
 619
 620 \begin_layout LyX-Code
 621     struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH */
 622 \end_layout
 623
 624 \begin_layout LyX-Code
 625     tdb_hash_func hash_fn;
 626 \end_layout
 627
 628 \begin_layout LyX-Code
 629     void *hash_private;
 630 \end_layout
 631
 632 \begin_layout LyX-Code
 633 };
 634 \end_layout
 635
 636 \begin_layout LyX-Code
 637 union tdb_attribute {
 638 \end_layout
 639
 640 \begin_layout LyX-Code
 641     struct tdb_attribute_base base;
 642 \end_layout
 643
 644 \begin_layout LyX-Code
 645     struct tdb_attribute_log log;
 646 \end_layout
 647
 648 \begin_layout LyX-Code
 649     struct tdb_attribute_hash hash;
 650 \end_layout
 651
 652 \begin_layout LyX-Code
 653 };
 654 \end_layout
 655
 656 \begin_layout Standard
 657 This allows future attributes to be added, even if this expands the size
 658  of the union.
 659 \end_layout
 660
 661 \begin_layout Subsection
 662 tdb_traverse Makes Impossible Guarantees
 663 \end_layout
 664
 665 \begin_layout Standard
 666 tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions, and it
 667  was thought that it was important to guarantee that all records which exist
 668  at the start and end of the traversal would be included, and no record
 669  would be included twice.
 670 \end_layout
 671
 672 \begin_layout Standard
 673 This adds complexity (see
 674 \begin_inset CommandInset ref
 675 LatexCommand ref
 676 reference "Reliable-Traversal-Adds"
 677
 678 \end_inset
 679
 680 ) and does not work anyway for records which are altered (in particular,
 681  those which are expanded may be effectively deleted and re-added behind
 682  the traversal).
 683 \end_layout
 684
 685 \begin_layout Subsubsection
 686 \begin_inset CommandInset label
 687 LatexCommand label
 688 name "traverse-Proposed-Solution"
 689
 690 \end_inset
 691
 692 Proposed Solution
 693 \end_layout
 694
 695 \begin_layout Standard
 696 Abandon the guarantee.
 697  You will see every record if no changes occur during your traversal, otherwise
 698  you will see some subset.
 699  You can prevent changes by using a transaction or the locking API.
 700 \end_layout
 701
 702 \begin_layout Subsection
 703 Nesting of Transactions Is Fraught
 704 \end_layout
 705
 706 \begin_layout Standard
 707 TDB has alternated between allowing nested transactions and not allowing
 708  them.
 709  Various paths in the Samba codebase assume that transactions will nest,
 710  and in a sense they can: the operation is only committed to disk when the
 711  outer transaction is committed.
 712  There are two problems, however:
 713 \end_layout
 714
 715 \begin_layout Enumerate
 716 Canceling the inner transaction will cause the outer transaction commit
 717  to fail, and will not undo any operations since the inner transaction began.
 718  This problem is soluble with some additional internal code.
 719 \end_layout
 720
 721 \begin_layout Enumerate
 722 An inner transaction commit can be cancelled by the outer transaction.
 723  This is desirable in the way which Samba's database initialization code
 724  uses transactions, but could be a surprise to any users expecting a successful
 725  transaction commit to expose changes to others.
 726 \end_layout
 727
 728 \begin_layout Standard
 729 The current solution is to specify the behavior at tdb_open(), with the
 730  default currently that nested transactions are allowed.
 731  This flag can also be changed at runtime.
 732 \end_layout
 733
 734 \begin_layout Subsubsection
 735 Proposed Solution
 736 \end_layout
 737
 738 \begin_layout Standard
 739 Given the usage patterns, it seems that the
 740 \begin_inset Quotes eld
 741 \end_inset
 742
 743 least-surprise
 744 \begin_inset Quotes erd
 745 \end_inset
 746
 747  behavior of disallowing nested transactions should become the default.
 748  Additionally, it seems the outer transaction is the only code which knows
 749  whether inner transactions should be allowed, so a flag to indicate this
 750  could be added to tdb_transaction_start.
 751  However, this behavior can be simulated with a wrapper which uses tdb_add_flags
 752 () and tdb_remove_flags(), so the API should not be expanded for this relatively
 753 -obscure case.
 754 \end_layout
 755
 756 \begin_layout Subsection
 757 Incorrect Hash Function is Not Detected
 758 \end_layout
 759
 760 \begin_layout Standard
 761 tdb_open_ex() allows the calling code to specify a different hash function
 762  to use, but does not check that all other processes accessing this tdb
 763  are using the same hash function.
 764  The result is that records are missing from tdb_fetch().
 765 \end_layout
 766
 767 \begin_layout Subsubsection
 768 Proposed Solution
 769 \end_layout
 770
 771 \begin_layout Standard
 772 The header should contain an example hash result (eg.
 773  the hash of 0xdeadbeef), and tdb_open_ex() should check that the given
 774  hash function produces the same answer, or fail the tdb_open call.
 775 \end_layout
 776
 777 \begin_layout Subsection
 778 tdb_set_max_dead/TDB_VOLATILE Expose Implementation
 779 \end_layout
 780
 781 \begin_layout Standard
 782 In response to scalability issues with the free list (
 783 \begin_inset CommandInset ref
 784 LatexCommand ref
 785 reference "TDB-Freelist-Is"
 786
 787 \end_inset
 788
 789 ) two API workarounds have been incorporated in TDB: tdb_set_max_dead()
 790  and the TDB_VOLATILE flag to tdb_open.
 791  The latter actually calls the former with an argument of
 792 \begin_inset Quotes eld
 793 \end_inset
 794
 795 5
 796 \begin_inset Quotes erd
 797 \end_inset
 798
 799 .
 800 \end_layout
 801
 802 \begin_layout Standard
 803 This code allows deleted records to accumulate without putting them in the
 804  free list.
 805  On delete we iterate through each chain and free them in a batch if there
 806  are more than max_dead entries.
 807  These are never otherwise recycled except as a side-effect of a tdb_repack.
 808 \end_layout
 809
 810 \begin_layout Subsubsection
 811 Proposed Solution
 812 \end_layout
 813
 814 \begin_layout Standard
 815 With the scalability problems of the freelist solved, this API can be removed.
 816  The TDB_VOLATILE flag may still be useful as a hint that store and delete
 817  of records will be at least as common as fetch in order to allow some internal
 818  tuning, but initially will become a no-op.
 819 \end_layout
 820
 821 \begin_layout Subsection
 822 \begin_inset CommandInset label
 823 LatexCommand label
 824 name "TDB-Files-Cannot"
 825
 826 \end_inset
 827
 828 TDB Files Cannot Be Opened Multiple Times In The Same Process
 829 \end_layout
 830
 831 \begin_layout Standard
 832 No process can open the same TDB twice; we check and disallow it.
 833  This is an unfortunate side-effect of fcntl locks, which operate on a per-file
 834  rather than per-file-descriptor basis, and do not nest.
 835  Thus, closing any file descriptor on a file clears all the locks obtained
 836  by this process, even if they were placed using a different file descriptor!
 837 \end_layout
 838
 839 \begin_layout Standard
 840 Note that even if this were solved, deadlock could occur if operations were
 841  nested: this is a more manageable programming error in most cases.
 842 \end_layout
 843
 844 \begin_layout Subsubsection
 845 Proposed Solution
 846 \end_layout
 847
 848 \begin_layout Standard
 849 We could lobby POSIX to fix the perverse rules, or at least lobby Linux
 850  to violate them so that the most common implementation does not have this
 851  restriction.
 852  This would be a generally good idea for other fcntl lock users.
 853 \end_layout
 854
 855 \begin_layout Standard
 856 Samba uses a wrapper which hands out the same tdb_context to multiple callers
 857  if this happens, and does simple reference counting.
 858  We should do this inside the tdb library, which already emulates lock nesting
 859  internally; it would need to recognize when deadlock occurs within a single
 860  process.
 861  This would create a new failure mode for tdb operations (while we currently
 862  handle locking failures, they are impossible in normal use and a process
 863  encountering them can do little but give up).
 864 \end_layout
 865
 866 \begin_layout Standard
 867 I do not see benefit in an additional tdb_open flag to indicate whether
 868  re-opening is allowed, as though there may be some benefit to adding a
 869  call to detect when a tdb_context is shared, to allow other to create such
 870  an API.
 871 \end_layout
 872
 873 \begin_layout Subsection
 874 TDB API Is Not POSIX Thread-safe
 875 \end_layout
 876
 877 \begin_layout Standard
 878 The TDB API uses an error code which can be queried after an operation to
 879  determine what went wrong.
 880  This programming model does not work with threads, unless specific additional
 881  guarantees are given by the implementation.
 882  In addition, even otherwise-independent threads cannot open the same TDB
 883  (as in
 884 \begin_inset CommandInset ref
 885 LatexCommand ref
 886 reference "TDB-Files-Cannot"
 887
 888 \end_inset
 889
 890 ).
 891 \end_layout
 892
 893 \begin_layout Subsubsection
 894 Proposed Solution
 895 \end_layout
 896
 897 \begin_layout Standard
 898 Reachitecting the API to include a tdb_errcode pointer would be a great
 899  deal of churn; we are better to guarantee that the tdb_errcode is per-thread
 900  so the current programming model can be maintained.
 901 \end_layout
 902
 903 \begin_layout Standard
 904 This requires dynamic per-thread allocations, which is awkward with POSIX
 905  threads (pthread_key_create space is limited and we cannot simply allocate
 906  a key for every TDB).
 907 \end_layout
 908
 909 \begin_layout Standard
 910 Internal locking is required to make sure that fcntl locks do not overlap
 911  between threads, and also that the global list of tdbs is maintained.
 912 \end_layout
 913
 914 \begin_layout Standard
 915 The aim is that building tdb with -DTDB_PTHREAD will result in a pthread-safe
 916  version of the library, and otherwise no overhead will exist.
 917
 918 \change_inserted 0 1284016998
 919  Alternatively, a hooking mechanism similar to that proposed for
 920 \begin_inset CommandInset ref
 921 LatexCommand ref
 922 reference "Proposed-Solution-locking-hook"
 923
 924 \end_inset
 925
 926  could be used to enable pthread locking at runtime.
 927 \change_unchanged
 928
 929 \end_layout
 930
 931 \begin_layout Subsection
 932 *_nonblock Functions And *_mark Functions Expose Implementation
 933 \end_layout
 934
 935 \begin_layout Standard
 936 CTDB
 937 \begin_inset Foot
 938 status collapsed
 939
 940 \begin_layout Plain Layout
 941 Clustered TDB, see http://ctdb.samba.org
 942 \end_layout
 943
 944 \end_inset
 945
 946  wishes to operate on TDB in a non-blocking manner.
 947  This is currently done as follows:
 948 \end_layout
 949
 950 \begin_layout Enumerate
 951 Call the _nonblock variant of an API function (eg.
 952  tdb_lockall_nonblock).
 953  If this fails:
 954 \end_layout
 955
 956 \begin_layout Enumerate
 957 Fork a child process, and wait for it to call the normal variant (eg.
 958  tdb_lockall).
 959 \end_layout
 960
 961 \begin_layout Enumerate
 962 If the child succeeds, call the _mark variant to indicate we already have
 963  the locks (eg.
 964  tdb_lockall_mark).
 965 \end_layout
 966
 967 \begin_layout Enumerate
 968 Upon completion, tell the child to release the locks (eg.
 969  tdb_unlockall).
 970 \end_layout
 971
 972 \begin_layout Enumerate
 973 Indicate to tdb that it should consider the locks removed (eg.
 974  tdb_unlockall_mark).
 975 \end_layout
 976
 977 \begin_layout Standard
 978 There are several issues with this approach.
 979  Firstly, adding two new variants of each function clutters the API for
 980  an obscure use, and so not all functions have three variants.
 981  Secondly, it assumes that all paths of the functions ask for the same locks,
 982  otherwise the parent process will have to get a lock which the child doesn't
 983  have under some circumstances.
 984  I don't believe this is currently the case, but it constrains the implementatio
 985 n.
 986
 987 \end_layout
 988
 989 \begin_layout Subsubsection
 990 \begin_inset CommandInset label
 991 LatexCommand label
 992 name "Proposed-Solution-locking-hook"
 993
 994 \end_inset
 995
 996 Proposed Solution
 997 \end_layout
 998
 999 \begin_layout Standard
1000 Implement a hook for locking methods, so that the caller can control the
1001  calls to create and remove fcntl locks.
1002  In this scenario, ctdbd would operate as follows:
1003 \end_layout
1004
1005 \begin_layout Enumerate
1006 Call the normal API function, eg tdb_lockall().
1007 \end_layout
1008
1009 \begin_layout Enumerate
1010 When the lock callback comes in, check if the child has the lock.
1011  Initially, this is always false.
1012  If so, return 0.
1013  Otherwise, try to obtain it in non-blocking mode.
1014  If that fails, return EWOULDBLOCK.
1015 \end_layout
1016
1017 \begin_layout Enumerate
1018 Release locks in the unlock callback as normal.
1019 \end_layout
1020
1021 \begin_layout Enumerate
1022 If tdb_lockall() fails, see if we recorded a lock failure; if so, call the
1023  child to repeat the operation.
1024 \end_layout
1025
1026 \begin_layout Enumerate
1027 The child records what locks it obtains, and returns that information to
1028  the parent.
1029 \end_layout
1030
1031 \begin_layout Enumerate
1032 When the child has succeeded, goto 1.
1033 \end_layout
1034
1035 \begin_layout Standard
1036 This is flexible enough to handle any potential locking scenario, even when
1037  lock requirements change.
1038  It can be optimized so that the parent does not release locks, just tells
1039  the child which locks it doesn't need to obtain.
1040 \end_layout
1041
1042 \begin_layout Standard
1043 It also keeps the complexity out of the API, and in ctdbd where it is needed.
1044 \end_layout
1045
1046 \begin_layout Subsection
1047 tdb_chainlock Functions Expose Implementation
1048 \end_layout
1049
1050 \begin_layout Standard
1051 tdb_chainlock locks some number of records, including the record indicated
1052  by the given key.
1053  This gave atomicity guarantees; no-one can start a transaction, alter,
1054  read or delete that key while the lock is held.
1055 \end_layout
1056
1057 \begin_layout Standard
1058 It also makes the same guarantee for any other key in the chain, which is
1059  an internal implementation detail and potentially a cause for deadlock.
1060 \end_layout
1061
1062 \begin_layout Subsubsection
1063 Proposed Solution
1064 \end_layout
1065
1066 \begin_layout Standard
1067 None.
1068  It would be nice to have an explicit single entry lock which effected no
1069  other keys.
1070  Unfortunately, this won't work for an entry which doesn't exist.
1071  Thus while chainlock may be implemented more efficiently for the existing
1072  case, it will still have overlap issues with the non-existing case.
1073  So it is best to keep the current (lack of) guarantee about which records
1074  will be effected to avoid constraining our implementation.
1075 \end_layout
1076
1077 \begin_layout Subsection
1078 Signal Handling is Not Race-Free
1079 \end_layout
1080
1081 \begin_layout Standard
1082 The tdb_setalarm_sigptr() call allows the caller's signal handler to indicate
1083  that the tdb locking code should return with a failure, rather than trying
1084  again when a signal is received (and errno == EAGAIN).
1085  This is usually used to implement timeouts.
1086 \end_layout
1087
1088 \begin_layout Standard
1089 Unfortunately, this does not work in the case where the signal is received
1090  before the tdb code enters the fcntl() call to place the lock: the code
1091  will sleep within the fcntl() code, unaware that the signal wants it to
1092  exit.
1093  In the case of long timeouts, this does not happen in practice.
1094 \end_layout
1095
1096 \begin_layout Subsubsection
1097 Proposed Solution
1098 \end_layout
1099
1100 \begin_layout Standard
1101 The locking hooks proposed in
1102 \begin_inset CommandInset ref
1103 LatexCommand ref
1104 reference "Proposed-Solution-locking-hook"
1105
1106 \end_inset
1107
1108  would allow the user to decide on whether to fail the lock acquisition
1109  on a signal.
1110  This allows the caller to choose their own compromise: they could narrow
1111  the race by checking immediately before the fcntl call.
1112 \begin_inset Foot
1113 status collapsed
1114
1115 \begin_layout Plain Layout
1116 It may be possible to make this race-free in some implementations by having
1117  the signal handler alter the struct flock to make it invalid.
1118  This will cause the fcntl() lock call to fail with EINVAL if the signal
1119  occurs before the kernel is entered, otherwise EAGAIN.
1120 \end_layout
1121
1122 \end_inset
1123
1124
1125 \end_layout
1126
1127 \begin_layout Subsection
1128 The API Uses Gratuitous Typedefs, Capitals
1129 \end_layout
1130
1131 \begin_layout Standard
1132 typedefs are useful for providing source compatibility when types can differ
1133  across implementations, or arguably in the case of function pointer definitions
1134  which are hard for humans to parse.
1135  Otherwise it is simply obfuscation and pollutes the namespace.
1136 \end_layout
1137
1138 \begin_layout Standard
1139 Capitalization is usually reserved for compile-time constants and macros.
1140 \end_layout
1141
1142 \begin_layout Description
1143 TDB_CONTEXT There is no reason to use this over 'struct tdb_context'; the
1144  definition isn't visible to the API user anyway.
1145 \end_layout
1146
1147 \begin_layout Description
1148 TDB_DATA There is no reason to use this over struct TDB_DATA; the struct
1149  needs to be understood by the API user.
1150 \end_layout
1151
1152 \begin_layout Description
1153 struct
1154 \begin_inset space ~
1155 \end_inset
1156
1157 TDB_DATA This would normally be called 'struct tdb_data'.
1158 \end_layout
1159
1160 \begin_layout Description
1161 enum
1162 \begin_inset space ~
1163 \end_inset
1164
1165 TDB_ERROR Similarly, this would normally be enum tdb_error.
1166 \end_layout
1167
1168 \begin_layout Subsubsection
1169 Proposed Solution
1170 \end_layout
1171
1172 \begin_layout Standard
1173 None.
1174  Introducing lower case variants would please pedants like myself, but if
1175  it were done the existing ones should be kept.
1176  There is little point forcing a purely cosmetic change upon tdb users.
1177 \end_layout
1178
1179 \begin_layout Subsection
1180 \begin_inset CommandInset label
1181 LatexCommand label
1182 name "tdb_log_func-Doesnt-Take"
1183
1184 \end_inset
1185
1186 tdb_log_func Doesn't Take The Private Pointer
1187 \end_layout
1188
1189 \begin_layout Standard
1190 For API compatibility reasons, the logging function needs to call tdb_get_loggin
1191 g_private() to retrieve the pointer registered by the tdb_open_ex for logging.
1192 \end_layout
1193
1194 \begin_layout Subsubsection
1195 Proposed Solution
1196 \end_layout
1197
1198 \begin_layout Standard
1199 It should simply take an extra argument, since we are prepared to break
1200  the API/ABI.
1201 \end_layout
1202
1203 \begin_layout Subsection
1204 Various Callback Functions Are Not Typesafe
1205 \end_layout
1206
1207 \begin_layout Standard
1208 The callback functions in tdb_set_logging_function (after
1209 \begin_inset CommandInset ref
1210 LatexCommand ref
1211 reference "tdb_log_func-Doesnt-Take"
1212
1213 \end_inset
1214
1215  is resolved), tdb_parse_record, tdb_traverse, tdb_traverse_read and tdb_check
1216  all take void * and must internally convert it to the argument type they
1217  were expecting.
1218 \end_layout
1219
1220 \begin_layout Standard
1221 If this type changes, the compiler will not produce warnings on the callers,
1222  since it only sees void *.
1223 \end_layout
1224
1225 \begin_layout Subsubsection
1226 Proposed Solution
1227 \end_layout
1228
1229 \begin_layout Standard
1230 With careful use of macros, we can create callback functions which give
1231  a warning when used on gcc and the types of the callback and its private
1232  argument differ.
1233  Unsupported compilers will not give a warning, which is no worse than now.
1234  In addition, the callbacks become clearer, as they need not use void *
1235  for their parameter.
1236 \end_layout
1237
1238 \begin_layout Standard
1239 See CCAN's typesafe_cb module at http://ccan.ozlabs.org/info/typesafe_cb.html
1240 \end_layout
1241
1242 \begin_layout Subsection
1243 TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, tdb_reopen_all Problematic
1244 \end_layout
1245
1246 \begin_layout Standard
1247 The TDB_CLEAR_IF_FIRST flag to tdb_open indicates that the TDB file should
1248  be cleared if the caller discovers it is the only process with the TDB
1249  open.
1250  However, if any caller does not specify TDB_CLEAR_IF_FIRST it will not
1251  be detected, so will have the TDB erased underneath them (usually resulting
1252  in a crash).
1253 \end_layout
1254
1255 \begin_layout Standard
1256 There is a similar issue on fork(); if the parent exits (or otherwise closes
1257  the tdb) before the child calls tdb_reopen_all() to establish the lock
1258  used to indicate the TDB is opened by someone, a TDB_CLEAR_IF_FIRST opener
1259  at that moment will believe it alone has opened the TDB and will erase
1260  it.
1261 \end_layout
1262
1263 \begin_layout Subsubsection
1264 Proposed Solution
1265 \end_layout
1266
1267 \begin_layout Standard
1268 Remove TDB_CLEAR_IF_FIRST.
1269  Other workarounds are possible, but see
1270 \begin_inset CommandInset ref
1271 LatexCommand ref
1272 reference "TDB_CLEAR_IF_FIRST-Imposes-Performance"
1273
1274 \end_inset
1275
1276 .
1277 \change_inserted 0 1284015637
1278
1279 \end_layout
1280
1281 \begin_layout Subsection
1282
1283 \change_inserted 0 1284015716
1284 Extending The Header Is Difficult
1285 \end_layout
1286
1287 \begin_layout Standard
1288
1289 \change_inserted 0 1284015906
1290 We have reserved (zeroed) words in the TDB header, which can be used for
1291  future features.
1292  If the future features are compulsory, the version number must be updated
1293  to prevent old code from accessing the database.
1294  But if the future feature is optional, we have no way of telling if older
1295  code is accessing the database or not.
1296 \end_layout
1297
1298 \begin_layout Subsubsection
1299
1300 \change_inserted 0 1284015637
1301 Proposed Solution
1302 \end_layout
1303
1304 \begin_layout Standard
1305
1306 \change_inserted 0 1284016114
1307 The header should contain a
1308 \begin_inset Quotes eld
1309 \end_inset
1310
1311 format variant
1312 \begin_inset Quotes erd
1313 \end_inset
1314
1315  value (64-bit).
1316  This is divided into two 32-bit parts:
1317 \end_layout
1318
1319 \begin_layout Enumerate
1320
1321 \change_inserted 0 1284016149
1322 The lower part reflects the format variant understood by code accessing
1323  the database.
1324 \end_layout
1325
1326 \begin_layout Enumerate
1327
1328 \change_inserted 0 1284016639
1329 The upper part reflects the format variant you must understand to write
1330  to the database (otherwise you can only open for reading).
1331 \end_layout
1332
1333 \begin_layout Standard
1334
1335 \change_inserted 0 1284016821
1336 The latter field can only be written at creation time, the former should
1337  be written under the OPEN_LOCK when opening the database for writing, if
1338  the variant of the code is lower than the current lowest variant.
1339 \end_layout
1340
1341 \begin_layout Standard
1342
1343 \change_inserted 0 1284016803
1344 This should allow backwards-compatible features to be added, and detection
1345  if older code (which doesn't understand the feature) writes to the database.
1346 \change_deleted 0 1284016101
1347
1348 \end_layout
1349
1350 \begin_layout Subsection
1351
1352 \change_inserted 0 1284015634
1353 Record Headers Are Not Expandible
1354 \end_layout
1355
1356 \begin_layout Standard
1357
1358 \change_inserted 0 1284015634
1359 If we later want to add (say) checksums on keys and data, it would require
1360  another format change, which we'd like to avoid.
1361 \end_layout
1362
1363 \begin_layout Subsubsection
1364
1365 \change_inserted 0 1284015634
1366 Proposed Solution
1367 \end_layout
1368
1369 \begin_layout Standard
1370
1371 \change_inserted 0 1284422552
1372 We often have extra padding at the tail of a record.
1373  If we ensure that the first byte (if any) of this padding is zero, we will
1374  have a way for future changes to detect code which doesn't understand a
1375  new format: the new code would write (say) a 1 at the tail, and thus if
1376  there is no tail or the first byte is 0, we would know the extension is
1377  not present on that record.
1378 \end_layout
1379
1380 \begin_layout Subsection
1381
1382 \change_inserted 0 1284422568
1383 TDB Does Not Use Talloc
1384 \end_layout
1385
1386 \begin_layout Standard
1387
1388 \change_inserted 0 1284422646
1389 Many users of TDB (particularly Samba) use the talloc allocator, and thus
1390  have to wrap TDB in a talloc context to use it conveniently.
1391 \end_layout
1392
1393 \begin_layout Subsubsection
1394
1395 \change_inserted 0 1284422656
1396 Proposed Solution
1397 \end_layout
1398
1399 \begin_layout Standard
1400
1401 \change_inserted 0 1284423065
1402 The allocation within TDB is not complicated enough to justify the use of
1403  talloc, and I am reluctant to force another (excellent) library on TDB
1404  users.
1405  Nonetheless a compromise is possible.
1406  An attribute (see
1407 \begin_inset CommandInset ref
1408 LatexCommand ref
1409 reference "attributes"
1410
1411 \end_inset
1412
1413 ) can be added later to tdb_open() to provide an alternate allocation mechanism,
1414  specifically for talloc but usable by any other allocator (which would
1415  ignore the
1416 \begin_inset Quotes eld
1417 \end_inset
1418
1419 context
1420 \begin_inset Quotes erd
1421 \end_inset
1422
1423  argument).
1424 \end_layout
1425
1426 \begin_layout Standard
1427
1428 \change_inserted 0 1284423042
1429 This would form a talloc heirarchy as expected, but the caller would still
1430  have to attach a destructor to the tdb context returned from tdb_open to
1431  close it.
1432  All TDB_DATA fields would be children of the tdb_context, and the caller
1433  would still have to manage them (using talloc_free() or talloc_steal()).
1434 \change_unchanged
1435
1436 \end_layout
1437
1438 \begin_layout Section
1439 Performance And Scalability Issues
1440 \end_layout
1441
1442 \begin_layout Subsection
1443 \begin_inset CommandInset label
1444 LatexCommand label
1445 name "TDB_CLEAR_IF_FIRST-Imposes-Performance"
1446
1447 \end_inset
1448
1449 TDB_CLEAR_IF_FIRST Imposes Performance Penalty
1450 \end_layout
1451
1452 \begin_layout Standard
1453 When TDB_CLEAR_IF_FIRST is specified, a 1-byte read lock is placed at offset
1454  4 (aka.
1455  the ACTIVE_LOCK).
1456  While these locks never conflict in normal tdb usage, they do add substantial
1457  overhead for most fcntl lock implementations when the kernel scans to detect
1458  if a lock conflict exists.
1459  This is often a single linked list, making the time to acquire and release
1460  a fcntl lock O(N) where N is the number of processes with the TDB open,
1461  not the number actually doing work.
1462 \end_layout
1463
1464 \begin_layout Standard
1465 In a Samba server it is common to have huge numbers of clients sitting idle,
1466  and thus they have weaned themselves off the TDB_CLEAR_IF_FIRST flag.
1467 \begin_inset Foot
1468 status collapsed
1469
1470 \begin_layout Plain Layout
1471 There is a flag to tdb_reopen_all() which is used for this optimization:
1472  if the parent process will outlive the child, the child does not need the
1473  ACTIVE_LOCK.
1474  This is a workaround for this very performance issue.
1475 \end_layout
1476
1477 \end_inset
1478
1479
1480 \end_layout
1481
1482 \begin_layout Subsubsection
1483 Proposed Solution
1484 \end_layout
1485
1486 \begin_layout Standard
1487 Remove the flag.
1488  It was a neat idea, but even trivial servers tend to know when they are
1489  initializing for the first time and can simply unlink the old tdb at that
1490  point.
1491 \end_layout
1492
1493 \begin_layout Subsection
1494 TDB Files Have a 4G Limit
1495 \end_layout
1496
1497 \begin_layout Standard
1498 This seems to be becoming an issue (so much for
1499 \begin_inset Quotes eld
1500 \end_inset
1501
1502 trivial
1503 \begin_inset Quotes erd
1504 \end_inset
1505
1506 !), particularly for ldb.
1507 \end_layout
1508
1509 \begin_layout Subsubsection
1510 Proposed Solution
1511 \end_layout
1512
1513 \begin_layout Standard
1514 A new, incompatible TDB format which uses 64 bit offsets internally rather
1515  than 32 bit as now.
1516  For simplicity of endian conversion (which TDB does on the fly if required),
1517  all values will be 64 bit on disk.
1518  In practice, some upper bits may be used for other purposes, but at least
1519  56 bits will be available for file offsets.
1520 \end_layout
1521
1522 \begin_layout Standard
1523 tdb_open() will automatically detect the old version, and even create them
1524  if TDB_VERSION6 is specified to tdb_open.
1525 \end_layout
1526
1527 \begin_layout Standard
1528 32 bit processes will still be able to access TDBs larger than 4G (assuming
1529  that their off_t allows them to seek to 64 bits), they will gracefully
1530  fall back as they fail to mmap.
1531  This can happen already with large TDBs.
1532 \end_layout
1533
1534 \begin_layout Standard
1535 Old versions of tdb will fail to open the new TDB files (since 28 August
1536  2009, commit 398d0c29290: prior to that any unrecognized file format would
1537  be erased and initialized as a fresh tdb!)
1538 \end_layout
1539
1540 \begin_layout Subsection
1541 TDB Records Have a 4G Limit
1542 \end_layout
1543
1544 \begin_layout Standard
1545 This has not been a reported problem, and the API uses size_t which can
1546  be 64 bit on 64 bit platforms.
1547  However, other limits may have made such an issue moot.
1548 \end_layout
1549
1550 \begin_layout Subsubsection
1551 Proposed Solution
1552 \end_layout
1553
1554 \begin_layout Standard
1555 Record sizes will be 64 bit, with an error returned on 32 bit platforms
1556  which try to access such records (the current implementation would return
1557  TDB_ERR_OOM in a similar case).
1558  It seems unlikely that 32 bit keys will be a limitation, so the implementation
1559  may not support this (see
1560 \begin_inset CommandInset ref
1561 LatexCommand ref
1562 reference "sub:Records-Incur-A"
1563
1564 \end_inset
1565
1566 ).
1567 \end_layout
1568
1569 \begin_layout Subsection
1570 Hash Size Is Determined At TDB Creation Time
1571 \end_layout
1572
1573 \begin_layout Standard
1574 TDB contains a number of hash chains in the header; the number is specified
1575  at creation time, and defaults to 131.
1576  This is such a bottleneck on large databases (as each hash chain gets quite
1577  long), that LDB uses 10,000 for this hash.
1578  In general it is impossible to know what the 'right' answer is at database
1579  creation time.
1580 \end_layout
1581
1582 \begin_layout Subsubsection
1583
1584 \change_inserted 0 1283336713
1585 \begin_inset CommandInset label
1586 LatexCommand label
1587 name "sub:Hash-Size-Solution"
1588
1589 \end_inset
1590
1591
1592 \change_unchanged
1593 Proposed Solution
1594 \end_layout
1595
1596 \begin_layout Standard
1597 After comprehensive performance testing on various scalable hash variants
1598 \begin_inset Foot
1599 status collapsed
1600
1601 \begin_layout Plain Layout
1602 http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94 This was annoying
1603  because I was previously convinced that an expanding tree of hashes would
1604  be very close to optimal.
1605 \end_layout
1606
1607 \end_inset
1608
1609 , it became clear that it is hard to beat a straight linear hash table which
1610  doubles in size when it reaches saturation.
1611
1612 \change_deleted 0 1283307675
1613 There are three details which become important:
1614 \end_layout
1615
1616 \begin_layout Enumerate
1617
1618 \change_deleted 0 1283307675
1619 On encountering a full bucket, we use the next bucket.
1620 \end_layout
1621
1622 \begin_layout Enumerate
1623
1624 \change_deleted 0 1283307675
1625 Extra hash bits are stored with the offset, to reduce comparisons.
1626 \end_layout
1627
1628 \begin_layout Enumerate
1629
1630 \change_deleted 0 1283307675
1631 A marker entry is used on deleting an entry.
1632 \end_layout
1633
1634 \begin_layout Standard
1635
1636 \change_deleted 0 1283307675
1637 The doubling of the table must be done under a transaction; we will not
1638  reduce it on deletion, so it will be an unusual case.
1639  It will either be placed at the head (other entries will be moved out the
1640  way so we can expand).
1641  We could have a pointer in the header to the current hashtable location,
1642  but that pointer would have to be read frequently to check for hashtable
1643  moves.
1644 \end_layout
1645
1646 \begin_layout Standard
1647
1648 \change_deleted 0 1283307675
1649 The locking for this is slightly more complex than the chained case; we
1650  currently have one lock per bucket, and that means we would need to expand
1651  the lock if we overflow to the next bucket.
1652  The frequency of such collisions will effect our locking heuristics: we
1653  can always lock more buckets than we need.
1654 \end_layout
1655
1656 \begin_layout Standard
1657
1658 \change_deleted 0 1283307675
1659 One possible optimization is to only re-check the hash size on an insert
1660  or a lookup miss.
1661
1662 \change_inserted 0 1283307770
1663  Unfortunately, altering the hash table introduces serious locking complications
1664 : the entire hash table needs to be locked to enlarge the hash table, and
1665  others might be holding locks.
1666  Particularly insidious are insertions done under tdb_chainlock.
1667 \end_layout
1668
1669 \begin_layout Standard
1670
1671 \change_inserted 0 1283336187
1672 Thus an expanding layered hash will be used: an array of hash groups, with
1673  each hash group exploding into pointers to lower hash groups once it fills,
1674  turning into a hash tree.
1675  This has implications for locking: we must lock the entire group in case
1676  we need to expand it, yet we don't know how deep the tree is at that point.
1677 \end_layout
1678
1679 \begin_layout Standard
1680
1681 \change_inserted 0 1283336586
1682 Note that bits from the hash table entries should be stolen to hold more
1683  hash bits to reduce the penalty of collisions.
1684  We can use the otherwise-unused lower 3 bits.
1685  If we limit the size of the database to 64 exabytes, we can use the top
1686  8 bits of the hash entry as well.
1687  These 11 bits would reduce false positives down to 1 in 2000 which is more
1688  than we need: we can use one of the bits to indicate that the extra hash
1689  bits are valid.
1690  This means we can choose not to re-hash all entries when we expand a hash
1691  group; simply use the next bits we need and mark them invalid.
1692 \change_unchanged
1693
1694 \end_layout
1695
1696 \begin_layout Subsection
1697 \begin_inset CommandInset label
1698 LatexCommand label
1699 name "TDB-Freelist-Is"
1700
1701 \end_inset
1702
1703 TDB Freelist Is Highly Contended
1704 \end_layout
1705
1706 \begin_layout Standard
1707 TDB uses a single linked list for the free list.
1708  Allocation occurs as follows, using heuristics which have evolved over
1709  time:
1710 \end_layout
1711
1712 \begin_layout Enumerate
1713 Get the free list lock for this whole operation.
1714 \end_layout
1715
1716 \begin_layout Enumerate
1717 Multiply length by 1.25, so we always over-allocate by 25%.
1718 \end_layout
1719
1720 \begin_layout Enumerate
1721 Set the slack multiplier to 1.
1722 \end_layout
1723
1724 \begin_layout Enumerate
1725 Examine the current freelist entry: if it is > length but < the current
1726  best case, remember it as the best case.
1727 \end_layout
1728
1729 \begin_layout Enumerate
1730 Multiply the slack multiplier by 1.05.
1731 \end_layout
1732
1733 \begin_layout Enumerate
1734 If our best fit so far is less than length * slack multiplier, return it.
1735  The slack will be turned into a new free record if it's large enough.
1736 \end_layout
1737
1738 \begin_layout Enumerate
1739 Otherwise, go onto the next freelist entry.
1740 \end_layout
1741
1742 \begin_layout Standard
1743 Deleting a record occurs as follows:
1744 \end_layout
1745
1746 \begin_layout Enumerate
1747 Lock the hash chain for this whole operation.
1748 \end_layout
1749
1750 \begin_layout Enumerate
1751 Walk the chain to find the record, keeping the prev pointer offset.
1752 \end_layout
1753
1754 \begin_layout Enumerate
1755 If max_dead is non-zero:
1756 \end_layout
1757
1758 \begin_deeper
1759 \begin_layout Enumerate
1760 Walk the hash chain again and count the dead records.
1761 \end_layout
1762
1763 \begin_layout Enumerate
1764 If it's more than max_dead, bulk free all the dead ones (similar to steps
1765  4 and below, but the lock is only obtained once).
1766 \end_layout
1767
1768 \begin_layout Enumerate
1769 Simply mark this record as dead and return.
1770
1771 \end_layout
1772
1773 \end_deeper
1774 \begin_layout Enumerate
1775 Get the free list lock for the remainder of this operation.
1776 \end_layout
1777
1778 \begin_layout Enumerate
1779 \begin_inset CommandInset label
1780 LatexCommand label
1781 name "right-merging"
1782
1783 \end_inset
1784
1785 Examine the following block to see if it is free; if so, enlarge the current
1786  block and remove that block from the free list.
1787  This was disabled, as removal from the free list was O(entries-in-free-list).
1788 \end_layout
1789
1790 \begin_layout Enumerate
1791 Examine the preceeding block to see if it is free: for this reason, each
1792  block has a 32-bit tailer which indicates its length.
1793  If it is free, expand it to cover our new block and return.
1794 \end_layout
1795
1796 \begin_layout Enumerate
1797 Otherwise, prepend ourselves to the free list.
1798 \end_layout
1799
1800 \begin_layout Standard
1801 Disabling right-merging (step
1802 \begin_inset CommandInset ref
1803 LatexCommand ref
1804 reference "right-merging"
1805
1806 \end_inset
1807
1808 ) causes fragmentation; the other heuristics proved insufficient to address
1809  this, so the final answer to this was that when we expand the TDB file
1810  inside a transaction commit, we repack the entire tdb.
1811 \end_layout
1812
1813 \begin_layout Standard
1814 The single list lock limits our allocation rate; due to the other issues
1815  this is not currently seen as a bottleneck.
1816 \end_layout
1817
1818 \begin_layout Subsubsection
1819 Proposed Solution
1820 \change_deleted 0 1283336858
1821
1822 \end_layout
1823
1824 \begin_layout Standard
1825 The first step is to remove all the current heuristics, as they obviously
1826  interact, then examine them once the lock contention is addressed.
1827 \end_layout
1828
1829 \begin_layout Standard
1830 The free list must be split to reduce contention.
1831  Assuming perfect free merging, we can at most have 1 free list entry for
1832  each entry.
1833  This implies that the number of free lists is related to the size of the
1834  hash table, but as it is rare to walk a large number of free list entries
1835  we can use far fewer, say 1/32 of the number of hash buckets.
1836 \change_inserted 0 1283336910
1837
1838 \end_layout
1839
1840 \begin_layout Standard
1841
1842 \change_inserted 0 1283337052
1843 It seems tempting to try to reuse the hash implementation which we use for
1844  records here, but we have two ways of searching for free entries: for allocatio
1845 n we search by size (and possibly zone) which produces too many clashes
1846  for our hash table to handle well, and for coalescing we search by address.
1847  Thus an array of doubly-linked free lists seems preferable.
1848 \change_unchanged
1849
1850 \end_layout
1851
1852 \begin_layout Standard
1853 There are various benefits in using per-size free lists (see
1854 \begin_inset CommandInset ref
1855 LatexCommand ref
1856 reference "sub:TDB-Becomes-Fragmented"
1857
1858 \end_inset
1859
1860 ) but it's not clear this would reduce contention in the common case where
1861  all processes are allocating/freeing the same size.
1862  Thus we almost certainly need to divide in other ways: the most obvious
1863  is to divide the file into zones, and using a free list (or set of free
1864  lists) for each.
1865  This approximates address ordering.
1866 \end_layout
1867
1868 \begin_layout Standard
1869 Note that this means we need to split the free lists when we expand the
1870  file; this is probably acceptable when we double the hash table size, since
1871  that is such an expensive operation already.
1872  In the case of increasing the file size, there is an optimization we can
1873  use: if we use M in the formula above as the file size rounded up to the
1874  next power of 2, we only need reshuffle free lists when the file size crosses
1875  a power of 2 boundary,
1876 \emph on
1877 and
1878 \emph default
1879 reshuffling the free lists is trivial: we simply merge every consecutive
1880  pair of free lists.
1881 \end_layout
1882
1883 \begin_layout Standard
1884 The basic algorithm is as follows.
1885  Freeing is simple:
1886 \end_layout
1887
1888 \begin_layout Enumerate
1889 Identify the correct zone.
1890 \end_layout
1891
1892 \begin_layout Enumerate
1893 Lock the corresponding list.
1894 \end_layout
1895
1896 \begin_layout Enumerate
1897 Re-check the zone (we didn't have a lock, sizes could have changed): relock
1898  if necessary.
1899 \end_layout
1900
1901 \begin_layout Enumerate
1902 Place the freed entry in the list for that zone.
1903 \end_layout
1904
1905 \begin_layout Standard
1906 Allocation is a little more complicated, as we perform delayed coalescing
1907  at this point:
1908 \end_layout
1909
1910 \begin_layout Enumerate
1911 Pick a zone either the zone we last freed into, or based on a
1912 \begin_inset Quotes eld
1913 \end_inset
1914
1915 random
1916 \begin_inset Quotes erd
1917 \end_inset
1918
1919  number.
1920 \end_layout
1921
1922 \begin_layout Enumerate
1923 Lock the corresponding list.
1924 \end_layout
1925
1926 \begin_layout Enumerate
1927 Re-check the zone: relock if necessary.
1928 \end_layout
1929
1930 \begin_layout Enumerate
1931 If the top entry is -large enough, remove it from the list and return it.
1932 \end_layout
1933
1934 \begin_layout Enumerate
1935 Otherwise, coalesce entries in the list.If there was no entry large enough,
1936  unlock the list and try the next zone.
1937 \end_layout
1938
1939 \begin_layout Enumerate
1940 If no zone satisfies, expand the file.
1941 \end_layout
1942
1943 \begin_layout Standard
1944 This optimizes rapid insert/delete of free list entries by not coalescing
1945  them all the time..
1946  First-fit address ordering ordering seems to be fairly good for keeping
1947  fragmentation low (see
1948 \begin_inset CommandInset ref
1949 LatexCommand ref
1950 reference "sub:TDB-Becomes-Fragmented"
1951
1952 \end_inset
1953
1954 ).
1955  Note that address ordering does not need a tailer to coalesce, though if
1956  we needed one we could have one cheaply: see
1957 \begin_inset CommandInset ref
1958 LatexCommand ref
1959 reference "sub:Records-Incur-A"
1960
1961 \end_inset
1962
1963 .
1964
1965 \end_layout
1966
1967 \begin_layout Standard
1968 I anticipate that the number of entries in each free zone would be small,
1969  but it might be worth using one free entry to hold pointers to the others
1970  for cache efficiency.
1971 \change_inserted 0 1283309850
1972
1973 \end_layout
1974
1975 \begin_layout Standard
1976
1977 \change_inserted 0 1283337216
1978 \begin_inset CommandInset label
1979 LatexCommand label
1980 name "freelist-in-zone"
1981
1982 \end_inset
1983
1984 If we want to avoid locking complexity (enlarging the free lists when we
1985  enlarge the file) we could place the array of free lists at the beginning
1986  of each zone.
1987  This means existing array lists never move, but means that a record cannot
1988  be larger than a zone.
1989  That in turn implies that zones should be variable sized (say, power of
1990  2), which makes the question
1991 \begin_inset Quotes eld
1992 \end_inset
1993
1994 what zone is this record in?
1995 \begin_inset Quotes erd
1996 \end_inset
1997
1998  much harder (and
1999 \begin_inset Quotes eld
2000 \end_inset
2001
2002 pick a random zone
2003 \begin_inset Quotes erd
2004 \end_inset
2005
2006 , but that's less common).
2007  It could be done with as few as 4 bits from the record header.
2008 \begin_inset Foot
2009 status open
2010
2011 \begin_layout Plain Layout
2012
2013 \change_inserted 0 1284424151
2014 Using
2015 \begin_inset Formula $2^{16+N*3}$
2016 \end_inset
2017
2018 means 0 gives a minimal 65536-byte zone, 15 gives the maximal
2019 \begin_inset Formula $2^{61}$
2020 \end_inset
2021
2022  byte zone.
2023  Zones range in factor of 8 steps.
2024  Given the zone size for the zone the current record is in, we can determine
2025  the start of the zone.
2026 \change_unchanged
2027
2028 \end_layout
2029
2030 \end_inset
2031
2032
2033 \change_unchanged
2034
2035 \end_layout
2036
2037 \begin_layout Subsection
2038 \begin_inset CommandInset label
2039 LatexCommand label
2040 name "sub:TDB-Becomes-Fragmented"
2041
2042 \end_inset
2043
2044 TDB Becomes Fragmented
2045 \end_layout
2046
2047 \begin_layout Standard
2048 Much of this is a result of allocation strategy
2049 \begin_inset Foot
2050 status collapsed
2051
2052 \begin_layout Plain Layout
2053 The Memory Fragmentation Problem: Solved? Johnstone & Wilson 1995 ftp://ftp.cs.ute
2054 xas.edu/pub/garbage/malloc/ismm98.ps
2055 \end_layout
2056
2057 \end_inset
2058
2059  and deliberate hobbling of coalescing; internal fragmentation (aka overallocati
2060 on) is deliberately set at 25%, and external fragmentation is only cured
2061  by the decision to repack the entire db when a transaction commit needs
2062  to enlarge the file.
2063 \end_layout
2064
2065 \begin_layout Subsubsection
2066 Proposed Solution
2067 \end_layout
2068
2069 \begin_layout Standard
2070 The 25% overhead on allocation works in practice for ldb because indexes
2071  tend to expand by one record at a time.
2072  This internal fragmentation can be resolved by having an
2073 \begin_inset Quotes eld
2074 \end_inset
2075
2076 expanded
2077 \begin_inset Quotes erd
2078 \end_inset
2079
2080  bit in the header to note entries that have previously expanded, and allocating
2081  more space for them.
2082 \end_layout
2083
2084 \begin_layout Standard
2085 There are is a spectrum of possible solutions for external fragmentation:
2086  one is to use a fragmentation-avoiding allocation strategy such as best-fit
2087  address-order allocator.
2088  The other end of the spectrum would be to use a bump allocator (very fast
2089  and simple) and simply repack the file when we reach the end.
2090 \end_layout
2091
2092 \begin_layout Standard
2093 There are three problems with efficient fragmentation-avoiding allocators:
2094  they are non-trivial, they tend to use a single free list for each size,
2095  and there's no evidence that tdb allocation patterns will match those recorded
2096  for general allocators (though it seems likely).
2097 \end_layout
2098
2099 \begin_layout Standard
2100 Thus we don't spend too much effort on external fragmentation; we will be
2101  no worse than the current code if we need to repack on occasion.
2102  More effort is spent on reducing freelist contention, and reducing overhead.
2103 \end_layout
2104
2105 \begin_layout Subsection
2106 \begin_inset CommandInset label
2107 LatexCommand label
2108 name "sub:Records-Incur-A"
2109
2110 \end_inset
2111
2112 Records Incur A 28-Byte Overhead
2113 \end_layout
2114
2115 \begin_layout Standard
2116 Each TDB record has a header as follows:
2117 \end_layout
2118
2119 \begin_layout LyX-Code
2120 struct tdb_record {
2121 \end_layout
2122
2123 \begin_layout LyX-Code
2124         tdb_off_t next; /* offset of the next record in the list */
2125 \end_layout
2126
2127 \begin_layout LyX-Code
2128         tdb_len_t rec_len; /* total byte length of record */
2129 \end_layout
2130
2131 \begin_layout LyX-Code
2132         tdb_len_t key_len; /* byte length of key */
2133 \end_layout
2134
2135 \begin_layout LyX-Code
2136         tdb_len_t data_len; /* byte length of data */
2137 \end_layout
2138
2139 \begin_layout LyX-Code
2140         uint32_t full_hash; /* the full 32 bit hash of the key */
2141 \end_layout
2142
2143 \begin_layout LyX-Code
2144         uint32_t magic;   /* try to catch errors */
2145 \end_layout
2146
2147 \begin_layout LyX-Code
2148         /* the following union is implied:
2149 \end_layout
2150
2151 \begin_layout LyX-Code
2152                 union {
2153 \end_layout
2154
2155 \begin_layout LyX-Code
2156                         char record[rec_len];
2157 \end_layout
2158
2159 \begin_layout LyX-Code
2160                         struct {
2161 \end_layout
2162
2163 \begin_layout LyX-Code
2164                                 char key[key_len];
2165 \end_layout
2166
2167 \begin_layout LyX-Code
2168                                 char data[data_len];
2169 \end_layout
2170
2171 \begin_layout LyX-Code
2172                         }
2173 \end_layout
2174
2175 \begin_layout LyX-Code
2176                         uint32_t totalsize; (tailer)
2177 \end_layout
2178
2179 \begin_layout LyX-Code
2180                 }
2181 \end_layout
2182
2183 \begin_layout LyX-Code
2184         */
2185 \end_layout
2186
2187 \begin_layout LyX-Code
2188 };
2189 \end_layout
2190
2191 \begin_layout Standard
2192 Naively, this would double to a 56-byte overhead on a 64 bit implementation.
2193 \end_layout
2194
2195 \begin_layout Subsubsection
2196 Proposed Solution
2197 \end_layout
2198
2199 \begin_layout Standard
2200 We can use various techniques to reduce this for an allocated block:
2201 \end_layout
2202
2203 \begin_layout Enumerate
2204 The 'next' pointer is not required, as we are using a flat hash table.
2205 \end_layout
2206
2207 \begin_layout Enumerate
2208 'rec_len' can instead be expressed as an addition to key_len and data_len
2209  (it accounts for wasted or overallocated length in the record).
2210  Since the record length is always a multiple of 8, we can conveniently
2211  fit it in 32 bits (representing up to 35 bits).
2212 \end_layout
2213
2214 \begin_layout Enumerate
2215 'key_len' and 'data_len' can be reduced.
2216  I'm unwilling to restrict 'data_len' to 32 bits, but instead we can combine
2217  the two into one 64-bit field and using a 5 bit value which indicates at
2218  what bit to divide the two.
2219  Keys are unlikely to scale as fast as data, so I'm assuming a maximum key
2220  size of 32 bits.
2221 \end_layout
2222
2223 \begin_layout Enumerate
2224 'full_hash' is used to avoid a memcmp on the
2225 \begin_inset Quotes eld
2226 \end_inset
2227
2228 miss
2229 \begin_inset Quotes erd
2230 \end_inset
2231
2232  case, but this is diminishing returns after a handful of bits (at 10 bits,
2233  it reduces 99.9% of false memcmp).
2234  As an aside, as the lower bits are already incorporated in the hash table
2235  resolution, the upper bits should be used here.
2236
2237 \change_inserted 0 1283336739
2238  Note that it's not clear that these bits will be a win, given the extra
2239  bits in the hash table itself (see
2240 \begin_inset CommandInset ref
2241 LatexCommand ref
2242 reference "sub:Hash-Size-Solution"
2243
2244 \end_inset
2245
2246 ).
2247 \change_unchanged
2248
2249 \end_layout
2250
2251 \begin_layout Enumerate
2252 'magic' does not need to be enlarged: it currently reflects one of 5 values
2253  (used, free, dead, recovery, and unused_recovery).
2254  It is useful for quick sanity checking however, and should not be eliminated.
2255 \end_layout
2256
2257 \begin_layout Enumerate
2258 'tailer' is only used to coalesce free blocks (so a block to the right can
2259  find the header to check if this block is free).
2260  This can be replaced by a single 'free' bit in the header of the following
2261  block (and the tailer only exists in free blocks).
2262 \begin_inset Foot
2263 status collapsed
2264
2265 \begin_layout Plain Layout
2266 This technique from Thomas Standish.
2267  Data Structure Techniques.
2268  Addison-Wesley, Reading, Massachusetts, 1980.
2269 \end_layout
2270
2271 \end_inset
2272
2273  The current proposed coalescing algorithm doesn't need this, however.
2274 \end_layout
2275
2276 \begin_layout Standard
2277 This produces a 16 byte used header like this:
2278 \end_layout
2279
2280 \begin_layout LyX-Code
2281 struct tdb_used_record {
2282 \end_layout
2283
2284 \begin_layout LyX-Code
2285         uint32_t magic : 16,
2286 \end_layout
2287
2288 \begin_layout LyX-Code
2289                  prev_is_free: 1,
2290 \end_layout
2291
2292 \begin_layout LyX-Code
2293                  key_data_divide: 5,
2294 \end_layout
2295
2296 \begin_layout LyX-Code
2297                  top_hash: 10;
2298 \end_layout
2299
2300 \begin_layout LyX-Code
2301         uint32_t extra_octets;
2302 \end_layout
2303
2304 \begin_layout LyX-Code
2305         uint64_t key_and_data_len;
2306 \end_layout
2307
2308 \begin_layout LyX-Code
2309 };
2310 \end_layout
2311
2312 \begin_layout Standard
2313 And a free record like this:
2314 \end_layout
2315
2316 \begin_layout LyX-Code
2317 struct tdb_free_record {
2318 \end_layout
2319
2320 \begin_layout LyX-Code
2321         uint32_t free_magic;
2322 \end_layout
2323
2324 \begin_layout LyX-Code
2325         uint64_t total_length;
2326 \change_inserted 0 1283337133
2327
2328 \end_layout
2329
2330 \begin_layout LyX-Code
2331
2332 \change_inserted 0 1283337139
2333         uint64_t prev, next;
2334 \change_unchanged
2335
2336 \end_layout
2337
2338 \begin_layout LyX-Code
2339         ...
2340 \end_layout
2341
2342 \begin_layout LyX-Code
2343         uint64_t tailer;
2344 \end_layout
2345
2346 \begin_layout LyX-Code
2347 };
2348 \end_layout
2349
2350 \begin_layout Standard
2351
2352 \change_inserted 0 1283337235
2353 We might want to take some bits from the used record's top_hash (and the
2354  free record which has 32 bits of padding to spare anyway) if we use variable
2355  sized zones.
2356  See
2357 \begin_inset CommandInset ref
2358 LatexCommand ref
2359 reference "freelist-in-zone"
2360
2361 \end_inset
2362
2363 .
2364 \change_unchanged
2365
2366 \end_layout
2367
2368 \begin_layout Subsection
2369 Transaction Commit Requires 4 fdatasync
2370 \end_layout
2371
2372 \begin_layout Standard
2373 The current transaction algorithm is:
2374 \end_layout
2375
2376 \begin_layout Enumerate
2377 write_recovery_data();
2378 \end_layout
2379
2380 \begin_layout Enumerate
2381 sync();
2382 \end_layout
2383
2384 \begin_layout Enumerate
2385 write_recovery_header();
2386 \end_layout
2387
2388 \begin_layout Enumerate
2389 sync();
2390 \end_layout
2391
2392 \begin_layout Enumerate
2393 overwrite_with_new_data();
2394 \end_layout
2395
2396 \begin_layout Enumerate
2397 sync();
2398 \end_layout
2399
2400 \begin_layout Enumerate
2401 remove_recovery_header();
2402 \end_layout
2403
2404 \begin_layout Enumerate
2405 sync();
2406 \end_layout
2407
2408 \begin_layout Standard
2409 On current ext3, each sync flushes all data to disk, so the next 3 syncs
2410  are relatively expensive.
2411  But this could become a performance bottleneck on other filesystems such
2412  as ext4.
2413 \end_layout
2414
2415 \begin_layout Subsubsection
2416 Proposed Solution
2417 \end_layout
2418
2419 \begin_layout Standard
2420 Neil Brown points out that this is overzealous, and only one sync is needed:
2421 \end_layout
2422
2423 \begin_layout Enumerate
2424 Bundle the recovery data, a transaction counter and a strong checksum of
2425  the new data.
2426 \end_layout
2427
2428 \begin_layout Enumerate
2429 Strong checksum that whole bundle.
2430 \end_layout
2431
2432 \begin_layout Enumerate
2433 Store the bundle in the database.
2434 \end_layout
2435
2436 \begin_layout Enumerate
2437 Overwrite the oldest of the two recovery pointers in the header (identified
2438  using the transaction counter) with the offset of this bundle.
2439 \end_layout
2440
2441 \begin_layout Enumerate
2442 sync.
2443 \end_layout
2444
2445 \begin_layout Enumerate
2446 Write the new data to the file.
2447 \end_layout
2448
2449 \begin_layout Standard
2450 Checking for recovery means identifying the latest bundle with a valid checksum
2451  and using the new data checksum to ensure that it has been applied.
2452  This is more expensive than the current check, but need only be done at
2453  open.
2454  For running databases, a separate header field can be used to indicate
2455  a transaction in progress; we need only check for recovery if this is set.
2456 \end_layout
2457
2458 \begin_layout Subsection
2459 \begin_inset CommandInset label
2460 LatexCommand label
2461 name "sub:TDB-Does-Not"
2462
2463 \end_inset
2464
2465 TDB Does Not Have Snapshot Support
2466 \end_layout
2467
2468 \begin_layout Subsubsection
2469 Proposed Solution
2470 \change_deleted 0 1284423472
2471
2472 \end_layout
2473
2474 \begin_layout Standard
2475 None.
2476  At some point you say
2477 \begin_inset Quotes eld
2478 \end_inset
2479
2480 use a real database
2481 \begin_inset Quotes erd
2482 \end_inset
2483
2484
2485 \change_inserted 0 1284423891
2486
2487 \change_deleted 0 1284423891
2488 .
2489
2490 \change_inserted 0 1284423901
2491  (but see
2492 \begin_inset CommandInset ref
2493 LatexCommand ref
2494 reference "replay-attribute"
2495
2496 \end_inset
2497
2498 ).
2499 \change_unchanged
2500
2501 \end_layout
2502
2503 \begin_layout Standard
2504 But as a thought experiment, if we implemented transactions to only overwrite
2505  free entries (this is tricky: there must not be a header in each entry
2506  which indicates whether it is free, but use of presence in metadata elsewhere),
2507  and a pointer to the hash table, we could create an entirely new commit
2508  without destroying existing data.
2509  Then it would be easy to implement snapshots in a similar way.
2510 \end_layout
2511
2512 \begin_layout Standard
2513 This would not allow arbitrary changes to the database, such as tdb_repack
2514  does, and would require more space (since we have to preserve the current
2515  and future entries at once).
2516  If we used hash trees rather than one big hash table, we might only have
2517  to rewrite some sections of the hash, too.
2518 \end_layout
2519
2520 \begin_layout Standard
2521 We could then implement snapshots using a similar method, using multiple
2522  different hash tables/free tables.
2523 \change_inserted 0 1284423495
2524
2525 \end_layout
2526
2527 \begin_layout Subsection
2528 Transactions Cannot Operate in Parallel
2529 \end_layout
2530
2531 \begin_layout Standard
2532 This would be useless for ldb, as it hits the index records with just about
2533  every update.
2534  It would add significant complexity in resolving clashes, and cause the
2535  all transaction callers to write their code to loop in the case where the
2536  transactions spuriously failed.
2537 \end_layout
2538
2539 \begin_layout Subsubsection
2540 Proposed Solution
2541 \end_layout
2542
2543 \begin_layout Standard
2544
2545 \change_inserted 0 1284424201
2546 None (but see
2547 \begin_inset CommandInset ref
2548 LatexCommand ref
2549 reference "replay-attribute"
2550
2551 \end_inset
2552
2553 ).
2554
2555 \change_unchanged
2556 We could solve a small part of the problem by providing read-only transactions.
2557  These would allow one write transaction to begin, but it could not commit
2558  until all r/o transactions are done.
2559  This would require a new RO_TRANSACTION_LOCK, which would be upgraded on
2560  commit.
2561 \end_layout
2562
2563 \begin_layout Subsection
2564 Default Hash Function Is Suboptimal
2565 \end_layout
2566
2567 \begin_layout Standard
2568 The Knuth-inspired multiplicative hash used by tdb is fairly slow (especially
2569  if we expand it to 64 bits), and works best when the hash bucket size is
2570  a prime number (which also means a slow modulus).
2571  In addition, it is highly predictable which could potentially lead to a
2572  Denial of Service attack in some TDB uses.
2573 \end_layout
2574
2575 \begin_layout Subsubsection
2576 Proposed Solution
2577 \end_layout
2578
2579 \begin_layout Standard
2580 The Jenkins lookup3 hash
2581 \begin_inset Foot
2582 status open
2583
2584 \begin_layout Plain Layout
2585 http://burtleburtle.net/bob/c/lookup3.c
2586 \end_layout
2587
2588 \end_inset
2589
2590  is a fast and superbly-mixing hash.
2591  It's used by the Linux kernel and almost everything else.
2592  This has the particular properties that it takes an initial seed, and produces
2593  two 32 bit hash numbers, which we can combine into a 64-bit hash.
2594 \end_layout
2595
2596 \begin_layout Standard
2597 The seed should be created at tdb-creation time from some random source,
2598  and placed in the header.
2599  This is far from foolproof, but adds a little bit of protection against
2600  hash bombing.
2601 \end_layout
2602
2603 \begin_layout Subsection
2604 \begin_inset CommandInset label
2605 LatexCommand label
2606 name "Reliable-Traversal-Adds"
2607
2608 \end_inset
2609
2610 Reliable Traversal Adds Complexity
2611 \end_layout
2612
2613 \begin_layout Standard
2614 We lock a record during traversal iteration, and try to grab that lock in
2615  the delete code.
2616  If that grab on delete fails, we simply mark it deleted and continue onwards;
2617  traversal checks for this condition and does the delete when it moves off
2618  the record.
2619 \end_layout
2620
2621 \begin_layout Standard
2622 If traversal terminates, the dead record may be left indefinitely.
2623 \end_layout
2624
2625 \begin_layout Subsubsection
2626 Proposed Solution
2627 \end_layout
2628
2629 \begin_layout Standard
2630 Remove reliability guarantees; see
2631 \begin_inset CommandInset ref
2632 LatexCommand ref
2633 reference "traverse-Proposed-Solution"
2634
2635 \end_inset
2636
2637 .
2638 \end_layout
2639
2640 \begin_layout Subsection
2641 Fcntl Locking Adds Overhead
2642 \end_layout
2643
2644 \begin_layout Standard
2645 Placing a fcntl lock means a system call, as does removing one.
2646  This is actually one reason why transactions can be faster (everything
2647  is locked once at transaction start).
2648  In the uncontended case, this overhead can theoretically be eliminated.
2649 \end_layout
2650
2651 \begin_layout Subsubsection
2652 Proposed Solution
2653 \end_layout
2654
2655 \begin_layout Standard
2656 None.
2657 \end_layout
2658
2659 \begin_layout Standard
2660 We tried this before with spinlock support, in the early days of TDB, and
2661  it didn't make much difference except in manufactured benchmarks.
2662 \end_layout
2663
2664 \begin_layout Standard
2665 We could use spinlocks (with futex kernel support under Linux), but it means
2666  that we lose automatic cleanup when a process dies with a lock.
2667  There is a method of auto-cleanup under Linux, but it's not supported by
2668  other operating systems.
2669  We could reintroduce a clear-if-first-style lock and sweep for dead futexes
2670  on open, but that wouldn't help the normal case of one concurrent opener
2671  dying.
2672  Increasingly elaborate repair schemes could be considered, but they require
2673  an ABI change (everyone must use them) anyway, so there's no need to do
2674  this at the same time as everything else.
2675 \end_layout
2676
2677 \begin_layout Subsection
2678 Some Transactions Don't Require Durability
2679 \end_layout
2680
2681 \begin_layout Standard
2682 Volker points out that gencache uses a CLEAR_IF_FIRST tdb for normal (fast)
2683  usage, and occasionally empties the results into a transactional TDB.
2684  This kind of usage prioritizes performance over durability: as long as
2685  we are consistent, data can be lost.
2686 \end_layout
2687
2688 \begin_layout Standard
2689 This would be more neatly implemented inside tdb: a
2690 \begin_inset Quotes eld
2691 \end_inset
2692
2693 soft
2694 \begin_inset Quotes erd
2695 \end_inset
2696
2697  transaction commit (ie.
2698  syncless) which meant that data may be reverted on a crash.
2699 \end_layout
2700
2701 \begin_layout Subsubsection
2702 Proposed Solution
2703 \end_layout
2704
2705 \begin_layout Standard
2706 None.
2707 \end_layout
2708
2709 \begin_layout Standard
2710 Unfortunately any transaction scheme which overwrites old data requires
2711  a sync before that overwrite to avoid the possibility of corruption.
2712 \end_layout
2713
2714 \begin_layout Standard
2715 It seems possible to use a scheme similar to that described in
2716 \begin_inset CommandInset ref
2717 LatexCommand ref
2718 reference "sub:TDB-Does-Not"
2719
2720 \end_inset
2721
2722 ,where transactions are committed without overwriting existing data, and
2723  an array of top-level pointers were available in the header.
2724  If the transaction is
2725 \begin_inset Quotes eld
2726 \end_inset
2727
2728 soft
2729 \begin_inset Quotes erd
2730 \end_inset
2731
2732  then we would not need a sync at all: existing processes would pick up
2733  the new hash table and free list and work with that.
2734 \end_layout
2735
2736 \begin_layout Standard
2737 At some later point, a sync would allow recovery of the old data into the
2738  free lists (perhaps when the array of top-level pointers filled).
2739  On crash, tdb_open() would examine the array of top levels, and apply the
2740  transactions until it encountered an invalid checksum.
2741 \change_inserted 0 1284423555
2742
2743 \end_layout
2744
2745 \begin_layout Subsection
2746
2747 \change_inserted 0 1284423617
2748 Tracing Is Fragile, Replay Is External
2749 \end_layout
2750
2751 \begin_layout Standard
2752
2753 \change_inserted 0 1284423719
2754 The current TDB has compile-time-enabled tracing code, but it often breaks
2755  as it is not enabled by default.
2756  In a similar way, the ctdb code has an external wrapper which does replay
2757  tracing so it can coordinate cluster-wide transactions.
2758 \end_layout
2759
2760 \begin_layout Subsubsection
2761
2762 \change_inserted 0 1284423864
2763 Proposed Solution
2764 \begin_inset CommandInset label
2765 LatexCommand label
2766 name "replay-attribute"
2767
2768 \end_inset
2769
2770
2771 \end_layout
2772
2773 \begin_layout Standard
2774
2775 \change_inserted 0 1284423850
2776 Tridge points out that an attribute can be later added to tdb_open (see
2777
2778 \begin_inset CommandInset ref
2779 LatexCommand ref
2780 reference "attributes"
2781
2782 \end_inset
2783
2784 ) to provide replay/trace hooks, which could become the basis for this and
2785  future parallel transactions and snapshot support.
2786 \change_unchanged
2787
2788 \end_layout
2789
2790 \end_body
2791 \end_document
2792 @
2793
2794
2795 1.9
2796 log
2797 @Extension mechanism.
2798 @
2799 text
2800 @d56 2
2801 a57 2
2802 \change_inserted 0 1284016854
2803 9-September
2804 d479 11
2805 d1303 1
2806 a1303 1
2807 \change_inserted 0 1284016847
2808 d1310 56
2809 d1945 1
2810 a1945 1
2811 \change_inserted 0 1283310945
2812 d1956 2
2813 d2402 2
2814 d2416 4
2815 d2421 12
2816 d2455 2
2817 d2476 12
2818 d2673 47
2819 @
2820
2821
2822 1.8
2823 log
2824 @Remove bogus footnote
2825 @
2826 text
2827 @d56 2
2828 a57 2
2829 \change_inserted 0 1283307544
2830 1-September
2831 d838 12
2832 d1198 103
2833 @
2834
2835
2836 1.7
2837 log
2838 @Moving hash table does not work.
2839 @
2840 text
2841 @a1436 12
2842 \begin_inset Foot
2843 status collapsed
2844
2845 \begin_layout Plain Layout
2846
2847 \change_inserted 0 1283336450
2848 If we make the hash offsets zone-relative, then this only restricts the
2849  zone size, not the overall database size.
2850 \end_layout
2851
2852 \end_inset
2853
2854 @
2855
2856
2857 1.6
2858 log
2859 @Commit changes
2860 @
2861 text
2862 @d38 1
2863 a38 1
2864 \author ""
2865 d53 7
2866 a59 1
2867 26-July-2010
2868 d1333 10
2869 d1361 3
2870 a1363 1
2871  There are three details which become important:
2872 d1367 2
2873 d1373 2
2874 d1379 2
2875 d1385 2
2876 d1397 2
2877 d1407 2
2878 d1411 45
2879 d1582 2
2880 d1598 14
2881 d1733 62
2882 d1996 13
2883 d2086 10
2884 d2110 15
2885 a2124 1
2886 \begin_layout LyX-Code
2887 @
2888
2889
2890 1.5
2891 log
2892 @Soft transaction commit
2893 @
2894 text
2895 @d38 1
2896 a38 1
2897 \author "Rusty Russell,,,"
2898 a52 4
2899
2900 \change_deleted 0 1280141199
2901 10-May-2010
2902 \change_inserted 0 1280141202
2903 a53 2
2904 \change_unchanged
2905
2906 a2028 2
2907
2908 \change_inserted 0 1280140902
2909 a2034 2
2910
2911 \change_unchanged
2912 a2212 2
2913 \change_inserted 0 1280140661
2914
2915 a2215 2
2916
2917 \change_inserted 0 1280140703
2918 a2219 2
2919
2920 \change_inserted 0 1280708312
2921 a2226 2
2922
2923 \change_inserted 0 1280708400
2924 a2239 2
2925
2926 \change_inserted 0 1280140836
2927 a2243 2
2928
2929 \change_inserted 0 1280708255
2930 a2247 2
2931
2932 \change_inserted 0 1280708374
2933 a2252 2
2934
2935 \change_inserted 0 1280141181
2936 a2274 2
2937
2938 \change_inserted 0 1280141345
2939 @
2940
2941
2942 1.4
2943 log
2944 @Merge changes
2945 @
2946 text
2947 @d38 1
2948 a38 1
2949 \author ""
2950 d53 2
2951 d56 4
2952 d2035 10
2953 d2223 84
2954 @
2955
2956
2957 1.3
2958 log
2959 @Transaction and freelist rethink.
2960 @
2961 text
2962 @d38 1
2963 a38 1
2964 \author "Rusty Russell,,,"
2965 d53 1
2966 a53 1
2967 27-April-2010
2968 d662 1
2969 a662 5
2970  behavior of disallowing
2971 \change_inserted 0 1272940179
2972 nested
2973 \change_unchanged
2974 transactions should become the default.
2975 a1210 2
2976 \change_inserted 0 1272944650
2977
2978 a1214 2
2979
2980 \change_inserted 0 1272944763
2981 a1218 2
2982 \change_unchanged
2983
2984 a1223 2
2985 \change_unchanged
2986
2987 a1301 2
2988
2989 \change_inserted 0 1273478114
2990 a1310 2
2991 \change_unchanged
2992
2993 d1515 1
2994 a1515 11
2995 The free list
2996 \change_deleted 0 1273469807
2997 should
2998 \change_inserted 0 1273469810
2999 must
3000 \change_unchanged
3001  be split
3002 \change_deleted 0 1273469815
3003 into multiple lists
3004 \change_unchanged
3005 to reduce contention.
3006 a1520 2
3007 \change_inserted 0 1273470006
3008
3009 a1523 2
3010
3011 \change_inserted 0 1273492055
3012 a1539 2
3013
3014 \change_inserted 0 1273483888
3015 a1551 2
3016 \change_unchanged
3017
3018 a1554 8
3019
3020 \change_deleted 0 1272942055
3021 There are various ways to organize these lisys, but because we want to be
3022  able to quickly identify which free list an entry is in, and reduce the
3023  number of locks required for merging, we will use zoning (eg.
3024  each free list covers some fixed fraction of the file).
3025
3026 \change_inserted 0 1273484187
3027 d1556 1
3028 a1556 7
3029
3030 \change_deleted 0 1273484194
3031 The algorithm for f
3032 \change_inserted 0 1273484194
3033 F
3034 \change_unchanged
3035 reeing is simple:
3036 d1560 1
3037 a1560 7
3038 Identify the correct
3039 \change_deleted 0 1273482856
3040 free list
3041 \change_inserted 0 1273482857
3042 zone
3043 \change_unchanged
3044 .
3045 d1564 1
3046 a1564 7
3047 Lock the
3048 \change_inserted 0 1273482895
3049 corresponding
3050 \change_unchanged
3051 list
3052 \change_inserted 0 1273482863
3053 .
3054 a1567 2
3055
3056 \change_inserted 0 1273482909
3057 d1573 1
3058 a1573 13
3059
3060 \change_deleted 0 1273482885
3061 , and p
3062 \change_inserted 0 1273482888
3063 P
3064 \change_unchanged
3065 lace the freed entry
3066 \change_deleted 0 1273492415
3067 at the head
3068 \change_inserted 0 1273492415
3069 in the list for that zone
3070 \change_unchanged
3071 .
3072 d1577 2
3073 a1578 7
3074 Allocation is a little more complicated, as we
3075 \change_deleted 0 1273483240
3076 merge entries as we walk the list:
3077 \change_inserted 0 1273484250
3078 perform delayed coalescing at this point:
3079 \change_unchanged
3080
3081 d1582 1
3082 a1582 19
3083 Pick a
3084 \change_deleted 0 1273482955
3085 free list;
3086 \change_inserted 0 1273482957
3087 zone
3088 \change_unchanged
3089  either the
3090 \change_deleted 0 1273482962
3091 list
3092 \change_inserted 0 1273482962
3093 zone
3094 \change_unchanged
3095  we last freed
3096 \change_deleted 0 1273482966
3097 o
3098 \change_inserted 0 1273482966
3099 i
3100 \change_unchanged
3101 nto, or based on a
3102 d1594 1
3103 a1594 9
3104 Lock th
3105 \change_inserted 0 1273482980
3106 e corresponding
3107 \change_deleted 0 1273482973
3108 at
3109 \change_unchanged
3110  list.
3111 \change_inserted 0 1273482982
3112
3113 a1597 2
3114
3115 \change_inserted 0 1273483084
3116 a1598 53
3117 \change_unchanged
3118
3119 \end_layout
3120
3121 \begin_layout Enumerate
3122 If the top entry is
3123 \change_deleted 0 1273492155
3124 well-sized,
3125 \change_inserted 0 1273492159
3126 -large enough,
3127 \change_unchanged
3128 remove it from the list and return it.
3129 \end_layout
3130
3131 \begin_layout Enumerate
3132 Otherwise,
3133 \change_inserted 0 1273492206
3134 coalesce entries in the list.
3135 \change_deleted 0 1273492200
3136 examine the entry to the right of it in the file.
3137  If it is free:
3138 \end_layout
3139
3140 \begin_deeper
3141 \begin_layout Enumerate
3142
3143 \change_deleted 0 1273492200
3144 If that entry is in a different list, lock that list too.
3145 \end_layout
3146
3147 \begin_layout Enumerate
3148
3149 \change_deleted 0 1273492200
3150 If we had to place a new lock, re-check that the entry is free.
3151 \end_layout
3152
3153 \begin_layout Enumerate
3154
3155 \change_deleted 0 1273492200
3156 Remove that entry from its free list and expand this entry to cover it.
3157 \end_layout
3158
3159 \begin_layout Enumerate
3160
3161 \change_deleted 0 1273485554
3162 Goto step 3.
3163 \end_layout
3164
3165 \end_deeper
3166 \begin_layout Enumerate
3167
3168 \change_inserted 0 1273485311
3169 If there was no entry large enough, unlock the list and try the next zone.
3170 d1602 1
3171 a1602 5
3172
3173 \change_deleted 0 1273483646
3174 Repeat step 3 with each entry in the list.
3175 \change_unchanged
3176
3177 d1606 2
3178 a1607 5
3179
3180 \change_deleted 0 1273483668
3181 Unlock the list and repeat step 2 with the next list.
3182 \change_unchanged
3183
3184 d1611 1
3185 a1611 7
3186 If no
3187 \change_deleted 0 1273483671
3188 list
3189 \change_inserted 0 1273483671
3190 zone
3191 \change_unchanged
3192  satisfies, expand the file.
3193 d1615 2
3194 a1616 9
3195 This optimizes rapid insert/delete of free list entries
3196 \change_inserted 0 1273485794
3197  by not coalescing them all the time.
3198 \change_deleted 0 1273483685
3199 , and allows us to get rid of the tailer altogether
3200 \change_unchanged
3201 .
3202
3203 \change_inserted 0 1273492299
3204 a1638 39
3205
3206 \change_deleted 0 1273476840
3207 The question of
3208 \begin_inset Quotes eld
3209 \end_inset
3210
3211 well-sized
3212 \begin_inset Quotes erd
3213 \end_inset
3214
3215  free entries is more difficult: the 25% overhead works in practice for
3216  ldb because indexes tend to expand by one record at a time.
3217  This can be resolved by having an
3218 \begin_inset Quotes eld
3219 \end_inset
3220
3221 expanded
3222 \begin_inset Quotes erd
3223 \end_inset
3224
3225  bit in the header to note entries that have previously expanded, and allocating
3226  more space for them.
3227  Whether the
3228 \begin_inset Quotes eld
3229 \end_inset
3230
3231 increasing slack
3232 \begin_inset Quotes erd
3233 \end_inset
3234
3235  algorithm should be implemented or first-fit used is still unknown: we
3236  will determine this once these other ideas are implemented.
3237 \change_inserted 0 1273483750
3238
3239 \end_layout
3240
3241 \begin_layout Standard
3242
3243 \change_inserted 0 1273492450
3244 a1644 2
3245
3246 \change_inserted 0 1273470441
3247 a1654 2
3248
3249 \change_inserted 0 1273476556
3250 a1659 2
3251
3252 \change_inserted 0 1273470423
3253 a1661 2
3254 \change_unchanged
3255
3256 a1672 2
3257
3258 \change_inserted 0 1273476847
3259 a1676 2
3260
3261 \change_inserted 0 1273476886
3262 a1691 2
3263
3264 \change_inserted 0 1273477233
3265 a1699 2
3266
3267 \change_inserted 0 1273477534
3268 a1706 2
3269
3270 \change_inserted 0 1273482700
3271 a1712 2
3272
3273 \change_inserted 0 1273478079
3274 a1722 2
3275
3276 \change_inserted 0 1273477839
3277 a1726 2
3278
3279 \change_inserted 0 1273477925
3280 a1730 2
3281
3282 \change_inserted 0 1273477925
3283 a1734 2
3284
3285 \change_inserted 0 1273477925
3286 a1738 2
3287
3288 \change_inserted 0 1273477925
3289 a1742 2
3290
3291 \change_inserted 0 1273477925
3292 a1746 2
3293
3294 \change_inserted 0 1273477925
3295 a1750 2
3296
3297 \change_inserted 0 1273477925
3298 a1754 2
3299
3300 \change_inserted 0 1273477925
3301 a1758 2
3302
3303 \change_inserted 0 1273477925
3304 a1762 2
3305
3306 \change_inserted 0 1273477925
3307 a1766 2
3308
3309 \change_inserted 0 1273477925
3310 a1770 2
3311
3312 \change_inserted 0 1273477925
3313 a1774 2
3314
3315 \change_inserted 0 1273477925
3316 a1778 2
3317
3318 \change_inserted 0 1273477925
3319 a1782 2
3320
3321 \change_inserted 0 1273477925
3322 a1786 2
3323
3324 \change_inserted 0 1273477925
3325 a1790 2
3326
3327 \change_inserted 0 1273477925
3328 a1794 2
3329
3330 \change_inserted 0 1273477925
3331 a1798 2
3332
3333 \change_inserted 0 1273492522
3334 a1802 2
3335
3336 \change_inserted 0 1273492530
3337 a1806 2
3338
3339 \change_inserted 0 1273492546
3340 a1810 2
3341
3342 \change_inserted 0 1273478239
3343 a1814 2
3344
3345 \change_inserted 0 1273479960
3346 a1821 2
3347
3348 \change_inserted 0 1273480265
3349 a1830 2
3350
3351 \change_inserted 0 1273480354
3352 a1845 2
3353
3354 \change_inserted 0 1273478968
3355 a1851 2
3356
3357 \change_inserted 0 1273492604
3358 a1859 2
3359
3360 \change_inserted 0 1273479572
3361 a1862 2
3362 \change_unchanged
3363
3364 a1870 2
3365
3366 \change_inserted 0 1273480282
3367 a1874 2
3368
3369 \change_inserted 0 1273478931
3370 a1878 2
3371
3372 \change_inserted 0 1273481549
3373 a1882 2
3374
3375 \change_inserted 0 1273481557
3376 a1886 2
3377
3378 \change_inserted 0 1273480307
3379 a1890 2
3380
3381 \change_inserted 0 1273480335
3382 a1894 2
3383
3384 \change_inserted 0 1273479897
3385 a1898 2
3386
3387 \change_inserted 0 1273479653
3388 a1902 2
3389
3390 \change_inserted 0 1273480371
3391 a1906 2
3392
3393 \change_inserted 0 1273480464
3394 a1910 2
3395
3396 \change_inserted 0 1273480399
3397 a1914 2
3398
3399 \change_inserted 0 1273480425
3400 a1918 2
3401
3402 \change_inserted 0 1273480453
3403 a1922 2
3404
3405 \change_inserted 0 1273480455
3406 a1926 2
3407
3408 \change_inserted 0 1273480450
3409 a1930 2
3410
3411 \change_inserted 0 1273480452
3412 a1935 2
3413 \change_inserted 0 1273478830
3414
3415 a1942 5
3416
3417 \change_deleted 0 1273481604
3418 In theory, we could get away with 2: one after we write the new data, and
3419  one to somehow atomically change over to it.
3420 \change_inserted 0 1273481632
3421 a1946 2
3422
3423 \change_inserted 0 1273481724
3424 a1950 2
3425
3426 \change_inserted 0 1273481713
3427 a1954 2
3428
3429 \change_inserted 0 1273481717
3430 a1958 2
3431
3432 \change_inserted 0 1273481730
3433 a1962 2
3434
3435 \change_inserted 0 1273481736
3436 a1966 2
3437
3438 \change_inserted 0 1273481744
3439 a1970 2
3440
3441 \change_inserted 0 1273481748
3442 a1974 2
3443
3444 \change_inserted 0 1273482185
3445 a1978 2
3446
3447 \change_inserted 0 1273482259
3448 a1989 50
3449
3450 \change_deleted 0 1273481848
3451 None.
3452  Trying to rewrite the transaction code is a separate experiment, which
3453  I encourage someone else to do.
3454  At some point you say
3455 \begin_inset Quotes eld
3456 \end_inset
3457
3458 use a real database
3459 \begin_inset Quotes erd
3460 \end_inset
3461
3462 .
3463 \end_layout
3464
3465 \begin_layout Standard
3466
3467 \change_deleted 0 1273481848
3468 But as a thought experiment:
3469 \change_unchanged
3470
3471 \end_layout
3472
3473 \begin_layout Standard
3474
3475 \change_deleted 0 1273481788
3476 Say there was a pointer in the header which said where the hash table and
3477  free list tables were, and that no blocks were labeled with whether they
3478  were free or not (it had to be derived from what list they were in).
3479  We could create new hash table and free list in some free space, and populate
3480  it as we want the post-committed state to look.
3481  Then we sync, then we switch the offset in the header, then we sync again.
3482 \end_layout
3483
3484 \begin_layout Standard
3485
3486 \change_deleted 0 1273481788
3487 This would not allow arbitrary changes to the database, such as tdb_repack
3488  does, and would require more space (since we have to preserve the current
3489  and future entries at once).
3490  If we used hash trees rather than one big hash table, we might only have
3491  to rewrite some sections of the hash, too.
3492 \change_inserted 0 1273481854
3493
3494 \end_layout
3495
3496 \begin_layout Standard
3497
3498 \change_inserted 0 1273482102
3499 a1993 2
3500
3501 \change_inserted 0 1273482061
3502 a1998 2
3503
3504 \change_inserted 0 1273482063
3505 a2002 2
3506
3507 \change_inserted 0 1273482072
3508 a2006 2
3509
3510 \change_inserted 0 1273482139
3511 a2011 2
3512
3513 \change_inserted 0 1273482364
3514 a2015 2
3515
3516 \change_inserted 0 1273482163
3517 a2019 2
3518
3519 \change_inserted 0 1273482493
3520 a2037 2
3521
3522 \change_inserted 0 1273482536
3523 a2046 2
3524 \change_unchanged
3525
3526 a2049 2
3527
3528 \change_inserted 0 1273482641
3529 a2058 2
3530
3531 \change_inserted 0 1273481827
3532 d2067 2
3533 a2068 11
3534 We could
3535 \change_inserted 0 1273481829
3536 then
3537 \change_unchanged
3538 implement snapshots using a similar method
3539 \change_deleted 0 1273481838
3540  to the above, only
3541 \change_inserted 0 1273481840
3542 ,
3543 \change_unchanged
3544  using multiple different hash tables/free tables.
3545 @
3546
3547
3548 1.2
3549 log
3550 @After first feedback (Ronnie & Volker)
3551 @
3552 text
3553 @d1314 13
3554 d1531 11
3555 a1541 1
3556 The free list should be split into multiple lists to reduce contention.
3557 d1547 39
3558 d1596 7
3559 d1604 1
3560 a1604 1
3561 The algorithm for freeing is simple:
3562 d1608 7
3563 a1614 1
3564 Identify the correct free list.
3565 d1618 30
3566 a1647 1
3567 Lock the list, and place the freed entry at the head.
3568 d1651 7
3569 a1657 2
3570 Allocation is a little more complicated, as we merge entries as we walk
3571  the list:
3572 d1661 19
3573 a1679 1
3574 Pick a free list; either the list we last freed onto, or based on a
3575 d1691 17
3576 a1707 1
3577 Lock that list.
3578 d1711 7
3579 a1717 1
3580 If the top entry is well-sized, remove it from the list and return it.
3581 d1721 5
3582 a1725 1
3583 Otherwise, examine the entry to the right of it in the file.
3584 d1731 2
3585 d1737 2
3586 d1743 2
3587 d1749 2
3588 d1756 8
3589 d1765 2
3590 d1770 2
3591 d1773 2
3592 d1778 7
3593 a1784 1
3594 If no list satisfies, expand the file.
3595 d1788 28
3596 a1815 2
3597 This optimizes rapid insert/delete of free list entries, and allows us to
3598  get rid of the tailer altogether.
3599 d1819 2
3600 d1851 1
3601 a1851 1
3602 \change_inserted 0 1272941474
3603 d1857 303
3604 a2159 18
3605 \change_inserted 0 1272942759
3606 There are various ways to organize these lists, but because we want to be
3607  able to quickly identify which free list an entry is in, and reduce the
3608  number of locks required for merging, we will use zoning (eg.
3609  each of the N free lists in a tdb file of size M covers a fixed fraction
3610  M/N).
3611  Note that this means we need to reshuffle the free lists when we expand
3612  the file; this is probably acceptable when we double the hash table size,
3613  since that is such an expensive operation already.
3614  In the case of increasing the file size, there is an optimization we can
3615  use: if we use M in the formula above as the file size rounded up to the
3616  next power of 2, we only need reshuffle free lists when the file size crosses
3617  a power of 2 boundary,
3618 \emph on
3619 and
3620 \emph default
3621 reshuffling the free lists is trivial: we simply merge every consecutive
3622  pair of free lists.
3623 d2164 107
3624 d2276 2
3625 d2280 59
3626 d2346 2
3627 d2363 2
3628 d2366 2
3629 d2371 2
3630 d2382 2
3631 d2389 57
3632 d2458 13
3633 d2474 32
3634 a2505 2
3635 We could implement snapshots using a similar method to the above, only using
3636  multiple different hash tables/free tables.
3637 @
3638
3639
3640 1.1
3641 log
3642 @Initial revision
3643 @
3644 text
3645 @d1 1
3646 a1 1
3647 #LyX 1.6.4 created this file. For more info see http://www.lyx.org/
3648 d36 3
3649 a38 3
3650 \tracking_changes false
3651 \output_changes false
3652 \author ""
3653 d662 5
3654 a666 1
3655  behavior of disallowing transactions should become the default.
3656 d1215 21
3657 d1527 2
3658 d1533 3
3659 a1535 1
3660  The algorithm for freeing is simple:
3661 d1642 26
3662 @