git.ozlabs.org Git - ccan/blob - ccan/tdb2/doc/design.lyx,v

   1 head    1.6;
   2 access;
   3 symbols;
   4 locks; strict;
   5 comment @# @;
   6
   7
   8 1.6
   9 date    2010.08.02.00.21.43;    author rusty;   state Exp;
  10 branches;
  11 next    1.5;
  12
  13 1.5
  14 date    2010.08.02.00.21.16;    author rusty;   state Exp;
  15 branches;
  16 next    1.4;
  17
  18 1.4
  19 date    2010.05.10.13.09.11;    author rusty;   state Exp;
  20 branches;
  21 next    1.3;
  22
  23 1.3
  24 date    2010.05.10.11.58.37;    author rusty;   state Exp;
  25 branches;
  26 next    1.2;
  27
  28 1.2
  29 date    2010.05.10.05.35.13;    author rusty;   state Exp;
  30 branches;
  31 next    1.1;
  32
  33 1.1
  34 date    2010.05.04.02.29.16;    author rusty;   state Exp;
  35 branches;
  36 next    ;
  37
  38
  39 desc
  40 @First draft
  41 @
  42
  43
  44 1.6
  45 log
  46 @Commit changes
  47 @
  48 text
  49 @#LyX 1.6.5 created this file. For more info see http://www.lyx.org/
  50 \lyxformat 345
  51 \begin_document
  52 \begin_header
  53 \textclass article
  54 \use_default_options true
  55 \language english
  56 \inputencoding auto
  57 \font_roman default
  58 \font_sans default
  59 \font_typewriter default
  60 \font_default_family default
  61 \font_sc false
  62 \font_osf false
  63 \font_sf_scale 100
  64 \font_tt_scale 100
  65
  66 \graphics default
  67 \paperfontsize default
  68 \use_hyperref false
  69 \papersize default
  70 \use_geometry false
  71 \use_amsmath 1
  72 \use_esint 1
  73 \cite_engine basic
  74 \use_bibtopic false
  75 \paperorientation portrait
  76 \secnumdepth 3
  77 \tocdepth 3
  78 \paragraph_separation indent
  79 \defskip medskip
  80 \quotes_language english
  81 \papercolumns 1
  82 \papersides 1
  83 \paperpagestyle default
  84 \tracking_changes true
  85 \output_changes true
  86 \author ""
  87 \author ""
  88 \end_header
  89
  90 \begin_body
  91
  92 \begin_layout Title
  93 TDB2: A Redesigning The Trivial DataBase
  94 \end_layout
  95
  96 \begin_layout Author
  97 Rusty Russell, IBM Corporation
  98 \end_layout
  99
 100 \begin_layout Date
 101 26-July-2010
 102 \end_layout
 103
 104 \begin_layout Abstract
 105 The Trivial DataBase on-disk format is 32 bits; with usage cases heading
 106  towards the 4G limit, that must change.
 107  This required breakage provides an opportunity to revisit TDB's other design
 108  decisions and reassess them.
 109 \end_layout
 110
 111 \begin_layout Section
 112 Introduction
 113 \end_layout
 114
 115 \begin_layout Standard
 116 The Trivial DataBase was originally written by Andrew Tridgell as a simple
 117  key/data pair storage system with the same API as dbm, but allowing multiple
 118  readers and writers while being small enough (< 1000 lines of C) to include
 119  in SAMBA.
 120  The simple design created in 1999 has proven surprisingly robust and performant
 121 , used in Samba versions 3 and 4 as well as numerous other projects.
 122  Its useful life was greatly increased by the (backwards-compatible!) addition
 123  of transaction support in 2005.
 124 \end_layout
 125
 126 \begin_layout Standard
 127 The wider variety and greater demands of TDB-using code has lead to some
 128  organic growth of the API, as well as some compromises on the implementation.
 129  None of these, by themselves, are seen as show-stoppers, but the cumulative
 130  effect is to a loss of elegance over the initial, simple TDB implementation.
 131  Here is a table of the approximate number of lines of implementation code
 132  and number of API functions at the end of each year:
 133 \end_layout
 134
 135 \begin_layout Standard
 136 \begin_inset Tabular
 137 <lyxtabular version="3" rows="12" columns="3">
 138 <features>
 139 <column alignment="center" valignment="top" width="0">
 140 <column alignment="center" valignment="top" width="0">
 141 <column alignment="center" valignment="top" width="0">
 142 <row>
 143 <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
 144 \begin_inset Text
 145
 146 \begin_layout Plain Layout
 147 Year End
 148 \end_layout
 149
 150 \end_inset
 151 </cell>
 152 <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
 153 \begin_inset Text
 154
 155 \begin_layout Plain Layout
 156 API Functions
 157 \end_layout
 158
 159 \end_inset
 160 </cell>
 161 <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
 162 \begin_inset Text
 163
 164 \begin_layout Plain Layout
 165 Lines of C Code Implementation
 166 \end_layout
 167
 168 \end_inset
 169 </cell>
 170 </row>
 171 <row>
 172 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 173 \begin_inset Text
 174
 175 \begin_layout Plain Layout
 176 1999
 177 \end_layout
 178
 179 \end_inset
 180 </cell>
 181 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 182 \begin_inset Text
 183
 184 \begin_layout Plain Layout
 185 13
 186 \end_layout
 187
 188 \end_inset
 189 </cell>
 190 <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
 191 \begin_inset Text
 192
 193 \begin_layout Plain Layout
 194 1195
 195 \end_layout
 196
 197 \end_inset
 198 </cell>
 199 </row>
 200 <row>
 201 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 202 \begin_inset Text
 203
 204 \begin_layout Plain Layout
 205 2000
 206 \end_layout
 207
 208 \end_inset
 209 </cell>
 210 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 211 \begin_inset Text
 212
 213 \begin_layout Plain Layout
 214 24
 215 \end_layout
 216
 217 \end_inset
 218 </cell>
 219 <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
 220 \begin_inset Text
 221
 222 \begin_layout Plain Layout
 223 1725
 224 \end_layout
 225
 226 \end_inset
 227 </cell>
 228 </row>
 229 <row>
 230 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 231 \begin_inset Text
 232
 233 \begin_layout Plain Layout
 234 2001
 235 \end_layout
 236
 237 \end_inset
 238 </cell>
 239 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 240 \begin_inset Text
 241
 242 \begin_layout Plain Layout
 243 32
 244 \end_layout
 245
 246 \end_inset
 247 </cell>
 248 <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
 249 \begin_inset Text
 250
 251 \begin_layout Plain Layout
 252 2228
 253 \end_layout
 254
 255 \end_inset
 256 </cell>
 257 </row>
 258 <row>
 259 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 260 \begin_inset Text
 261
 262 \begin_layout Plain Layout
 263 2002
 264 \end_layout
 265
 266 \end_inset
 267 </cell>
 268 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 269 \begin_inset Text
 270
 271 \begin_layout Plain Layout
 272 35
 273 \end_layout
 274
 275 \end_inset
 276 </cell>
 277 <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
 278 \begin_inset Text
 279
 280 \begin_layout Plain Layout
 281 2481
 282 \end_layout
 283
 284 \end_inset
 285 </cell>
 286 </row>
 287 <row>
 288 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 289 \begin_inset Text
 290
 291 \begin_layout Plain Layout
 292 2003
 293 \end_layout
 294
 295 \end_inset
 296 </cell>
 297 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 298 \begin_inset Text
 299
 300 \begin_layout Plain Layout
 301 35
 302 \end_layout
 303
 304 \end_inset
 305 </cell>
 306 <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
 307 \begin_inset Text
 308
 309 \begin_layout Plain Layout
 310 2552
 311 \end_layout
 312
 313 \end_inset
 314 </cell>
 315 </row>
 316 <row>
 317 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 318 \begin_inset Text
 319
 320 \begin_layout Plain Layout
 321 2004
 322 \end_layout
 323
 324 \end_inset
 325 </cell>
 326 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 327 \begin_inset Text
 328
 329 \begin_layout Plain Layout
 330 40
 331 \end_layout
 332
 333 \end_inset
 334 </cell>
 335 <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
 336 \begin_inset Text
 337
 338 \begin_layout Plain Layout
 339 2584
 340 \end_layout
 341
 342 \end_inset
 343 </cell>
 344 </row>
 345 <row>
 346 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 347 \begin_inset Text
 348
 349 \begin_layout Plain Layout
 350 2005
 351 \end_layout
 352
 353 \end_inset
 354 </cell>
 355 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 356 \begin_inset Text
 357
 358 \begin_layout Plain Layout
 359 38
 360 \end_layout
 361
 362 \end_inset
 363 </cell>
 364 <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
 365 \begin_inset Text
 366
 367 \begin_layout Plain Layout
 368 2647
 369 \end_layout
 370
 371 \end_inset
 372 </cell>
 373 </row>
 374 <row>
 375 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 376 \begin_inset Text
 377
 378 \begin_layout Plain Layout
 379 2006
 380 \end_layout
 381
 382 \end_inset
 383 </cell>
 384 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 385 \begin_inset Text
 386
 387 \begin_layout Plain Layout
 388 52
 389 \end_layout
 390
 391 \end_inset
 392 </cell>
 393 <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
 394 \begin_inset Text
 395
 396 \begin_layout Plain Layout
 397 3754
 398 \end_layout
 399
 400 \end_inset
 401 </cell>
 402 </row>
 403 <row>
 404 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 405 \begin_inset Text
 406
 407 \begin_layout Plain Layout
 408 2007
 409 \end_layout
 410
 411 \end_inset
 412 </cell>
 413 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 414 \begin_inset Text
 415
 416 \begin_layout Plain Layout
 417 66
 418 \end_layout
 419
 420 \end_inset
 421 </cell>
 422 <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
 423 \begin_inset Text
 424
 425 \begin_layout Plain Layout
 426 4398
 427 \end_layout
 428
 429 \end_inset
 430 </cell>
 431 </row>
 432 <row>
 433 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 434 \begin_inset Text
 435
 436 \begin_layout Plain Layout
 437 2008
 438 \end_layout
 439
 440 \end_inset
 441 </cell>
 442 <cell alignment="center" valignment="top" topline="true" leftline="true" usebox="none">
 443 \begin_inset Text
 444
 445 \begin_layout Plain Layout
 446 71
 447 \end_layout
 448
 449 \end_inset
 450 </cell>
 451 <cell alignment="center" valignment="top" topline="true" leftline="true" rightline="true" usebox="none">
 452 \begin_inset Text
 453
 454 \begin_layout Plain Layout
 455 4768
 456 \end_layout
 457
 458 \end_inset
 459 </cell>
 460 </row>
 461 <row>
 462 <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
 463 \begin_inset Text
 464
 465 \begin_layout Plain Layout
 466 2009
 467 \end_layout
 468
 469 \end_inset
 470 </cell>
 471 <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" usebox="none">
 472 \begin_inset Text
 473
 474 \begin_layout Plain Layout
 475 73
 476 \end_layout
 477
 478 \end_inset
 479 </cell>
 480 <cell alignment="center" valignment="top" topline="true" bottomline="true" leftline="true" rightline="true" usebox="none">
 481 \begin_inset Text
 482
 483 \begin_layout Plain Layout
 484 5715
 485 \end_layout
 486
 487 \end_inset
 488 </cell>
 489 </row>
 490 </lyxtabular>
 491
 492 \end_inset
 493
 494
 495 \end_layout
 496
 497 \begin_layout Standard
 498 This review is an attempt to catalog and address all the known issues with
 499  TDB and create solutions which address the problems without significantly
 500  increasing complexity; all involved are far too aware of the dangers of
 501  second system syndrome in rewriting a successful project like this.
 502 \end_layout
 503
 504 \begin_layout Section
 505 API Issues
 506 \end_layout
 507
 508 \begin_layout Subsection
 509 tdb_open_ex Is Not Expandable
 510 \end_layout
 511
 512 \begin_layout Standard
 513 The tdb_open() call was expanded to tdb_open_ex(), which added an optional
 514  hashing function and an optional logging function argument.
 515  Additional arguments to open would require the introduction of a tdb_open_ex2
 516  call etc.
 517 \end_layout
 518
 519 \begin_layout Subsubsection
 520 Proposed Solution
 521 \end_layout
 522
 523 \begin_layout Standard
 524 tdb_open() will take a linked-list of attributes:
 525 \end_layout
 526
 527 \begin_layout LyX-Code
 528 enum tdb_attribute {
 529 \end_layout
 530
 531 \begin_layout LyX-Code
 532     TDB_ATTRIBUTE_LOG = 0,
 533 \end_layout
 534
 535 \begin_layout LyX-Code
 536     TDB_ATTRIBUTE_HASH = 1
 537 \end_layout
 538
 539 \begin_layout LyX-Code
 540 };
 541 \end_layout
 542
 543 \begin_layout LyX-Code
 544 struct tdb_attribute_base {
 545 \end_layout
 546
 547 \begin_layout LyX-Code
 548     enum tdb_attribute attr;
 549 \end_layout
 550
 551 \begin_layout LyX-Code
 552     union tdb_attribute *next;
 553 \end_layout
 554
 555 \begin_layout LyX-Code
 556 };
 557 \end_layout
 558
 559 \begin_layout LyX-Code
 560 struct tdb_attribute_log {
 561 \end_layout
 562
 563 \begin_layout LyX-Code
 564     struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_LOG */
 565 \end_layout
 566
 567 \begin_layout LyX-Code
 568     tdb_log_func log_fn;
 569 \end_layout
 570
 571 \begin_layout LyX-Code
 572     void *log_private;
 573 \end_layout
 574
 575 \begin_layout LyX-Code
 576 };
 577 \end_layout
 578
 579 \begin_layout LyX-Code
 580 struct tdb_attribute_hash {
 581 \end_layout
 582
 583 \begin_layout LyX-Code
 584     struct tdb_attribute_base base; /* .attr = TDB_ATTRIBUTE_HASH */
 585 \end_layout
 586
 587 \begin_layout LyX-Code
 588     tdb_hash_func hash_fn;
 589 \end_layout
 590
 591 \begin_layout LyX-Code
 592     void *hash_private;
 593 \end_layout
 594
 595 \begin_layout LyX-Code
 596 };
 597 \end_layout
 598
 599 \begin_layout LyX-Code
 600 union tdb_attribute {
 601 \end_layout
 602
 603 \begin_layout LyX-Code
 604     struct tdb_attribute_base base;
 605 \end_layout
 606
 607 \begin_layout LyX-Code
 608     struct tdb_attribute_log log;
 609 \end_layout
 610
 611 \begin_layout LyX-Code
 612     struct tdb_attribute_hash hash;
 613 \end_layout
 614
 615 \begin_layout LyX-Code
 616 };
 617 \end_layout
 618
 619 \begin_layout Standard
 620 This allows future attributes to be added, even if this expands the size
 621  of the union.
 622 \end_layout
 623
 624 \begin_layout Subsection
 625 tdb_traverse Makes Impossible Guarantees
 626 \end_layout
 627
 628 \begin_layout Standard
 629 tdb_traverse (and tdb_firstkey/tdb_nextkey) predate transactions, and it
 630  was thought that it was important to guarantee that all records which exist
 631  at the start and end of the traversal would be included, and no record
 632  would be included twice.
 633 \end_layout
 634
 635 \begin_layout Standard
 636 This adds complexity (see
 637 \begin_inset CommandInset ref
 638 LatexCommand ref
 639 reference "Reliable-Traversal-Adds"
 640
 641 \end_inset
 642
 643 ) and does not work anyway for records which are altered (in particular,
 644  those which are expanded may be effectively deleted and re-added behind
 645  the traversal).
 646 \end_layout
 647
 648 \begin_layout Subsubsection
 649 \begin_inset CommandInset label
 650 LatexCommand label
 651 name "traverse-Proposed-Solution"
 652
 653 \end_inset
 654
 655 Proposed Solution
 656 \end_layout
 657
 658 \begin_layout Standard
 659 Abandon the guarantee.
 660  You will see every record if no changes occur during your traversal, otherwise
 661  you will see some subset.
 662  You can prevent changes by using a transaction or the locking API.
 663 \end_layout
 664
 665 \begin_layout Subsection
 666 Nesting of Transactions Is Fraught
 667 \end_layout
 668
 669 \begin_layout Standard
 670 TDB has alternated between allowing nested transactions and not allowing
 671  them.
 672  Various paths in the Samba codebase assume that transactions will nest,
 673  and in a sense they can: the operation is only committed to disk when the
 674  outer transaction is committed.
 675  There are two problems, however:
 676 \end_layout
 677
 678 \begin_layout Enumerate
 679 Canceling the inner transaction will cause the outer transaction commit
 680  to fail, and will not undo any operations since the inner transaction began.
 681  This problem is soluble with some additional internal code.
 682 \end_layout
 683
 684 \begin_layout Enumerate
 685 An inner transaction commit can be cancelled by the outer transaction.
 686  This is desirable in the way which Samba's database initialization code
 687  uses transactions, but could be a surprise to any users expecting a successful
 688  transaction commit to expose changes to others.
 689 \end_layout
 690
 691 \begin_layout Standard
 692 The current solution is to specify the behavior at tdb_open(), with the
 693  default currently that nested transactions are allowed.
 694  This flag can also be changed at runtime.
 695 \end_layout
 696
 697 \begin_layout Subsubsection
 698 Proposed Solution
 699 \end_layout
 700
 701 \begin_layout Standard
 702 Given the usage patterns, it seems that the
 703 \begin_inset Quotes eld
 704 \end_inset
 705
 706 least-surprise
 707 \begin_inset Quotes erd
 708 \end_inset
 709
 710  behavior of disallowing nested transactions should become the default.
 711  Additionally, it seems the outer transaction is the only code which knows
 712  whether inner transactions should be allowed, so a flag to indicate this
 713  could be added to tdb_transaction_start.
 714  However, this behavior can be simulated with a wrapper which uses tdb_add_flags
 715 () and tdb_remove_flags(), so the API should not be expanded for this relatively
 716 -obscure case.
 717 \end_layout
 718
 719 \begin_layout Subsection
 720 Incorrect Hash Function is Not Detected
 721 \end_layout
 722
 723 \begin_layout Standard
 724 tdb_open_ex() allows the calling code to specify a different hash function
 725  to use, but does not check that all other processes accessing this tdb
 726  are using the same hash function.
 727  The result is that records are missing from tdb_fetch().
 728 \end_layout
 729
 730 \begin_layout Subsubsection
 731 Proposed Solution
 732 \end_layout
 733
 734 \begin_layout Standard
 735 The header should contain an example hash result (eg.
 736  the hash of 0xdeadbeef), and tdb_open_ex() should check that the given
 737  hash function produces the same answer, or fail the tdb_open call.
 738 \end_layout
 739
 740 \begin_layout Subsection
 741 tdb_set_max_dead/TDB_VOLATILE Expose Implementation
 742 \end_layout
 743
 744 \begin_layout Standard
 745 In response to scalability issues with the free list (
 746 \begin_inset CommandInset ref
 747 LatexCommand ref
 748 reference "TDB-Freelist-Is"
 749
 750 \end_inset
 751
 752 ) two API workarounds have been incorporated in TDB: tdb_set_max_dead()
 753  and the TDB_VOLATILE flag to tdb_open.
 754  The latter actually calls the former with an argument of
 755 \begin_inset Quotes eld
 756 \end_inset
 757
 758 5
 759 \begin_inset Quotes erd
 760 \end_inset
 761
 762 .
 763 \end_layout
 764
 765 \begin_layout Standard
 766 This code allows deleted records to accumulate without putting them in the
 767  free list.
 768  On delete we iterate through each chain and free them in a batch if there
 769  are more than max_dead entries.
 770  These are never otherwise recycled except as a side-effect of a tdb_repack.
 771 \end_layout
 772
 773 \begin_layout Subsubsection
 774 Proposed Solution
 775 \end_layout
 776
 777 \begin_layout Standard
 778 With the scalability problems of the freelist solved, this API can be removed.
 779  The TDB_VOLATILE flag may still be useful as a hint that store and delete
 780  of records will be at least as common as fetch in order to allow some internal
 781  tuning, but initially will become a no-op.
 782 \end_layout
 783
 784 \begin_layout Subsection
 785 \begin_inset CommandInset label
 786 LatexCommand label
 787 name "TDB-Files-Cannot"
 788
 789 \end_inset
 790
 791 TDB Files Cannot Be Opened Multiple Times In The Same Process
 792 \end_layout
 793
 794 \begin_layout Standard
 795 No process can open the same TDB twice; we check and disallow it.
 796  This is an unfortunate side-effect of fcntl locks, which operate on a per-file
 797  rather than per-file-descriptor basis, and do not nest.
 798  Thus, closing any file descriptor on a file clears all the locks obtained
 799  by this process, even if they were placed using a different file descriptor!
 800 \end_layout
 801
 802 \begin_layout Standard
 803 Note that even if this were solved, deadlock could occur if operations were
 804  nested: this is a more manageable programming error in most cases.
 805 \end_layout
 806
 807 \begin_layout Subsubsection
 808 Proposed Solution
 809 \end_layout
 810
 811 \begin_layout Standard
 812 We could lobby POSIX to fix the perverse rules, or at least lobby Linux
 813  to violate them so that the most common implementation does not have this
 814  restriction.
 815  This would be a generally good idea for other fcntl lock users.
 816 \end_layout
 817
 818 \begin_layout Standard
 819 Samba uses a wrapper which hands out the same tdb_context to multiple callers
 820  if this happens, and does simple reference counting.
 821  We should do this inside the tdb library, which already emulates lock nesting
 822  internally; it would need to recognize when deadlock occurs within a single
 823  process.
 824  This would create a new failure mode for tdb operations (while we currently
 825  handle locking failures, they are impossible in normal use and a process
 826  encountering them can do little but give up).
 827 \end_layout
 828
 829 \begin_layout Standard
 830 I do not see benefit in an additional tdb_open flag to indicate whether
 831  re-opening is allowed, as though there may be some benefit to adding a
 832  call to detect when a tdb_context is shared, to allow other to create such
 833  an API.
 834 \end_layout
 835
 836 \begin_layout Subsection
 837 TDB API Is Not POSIX Thread-safe
 838 \end_layout
 839
 840 \begin_layout Standard
 841 The TDB API uses an error code which can be queried after an operation to
 842  determine what went wrong.
 843  This programming model does not work with threads, unless specific additional
 844  guarantees are given by the implementation.
 845  In addition, even otherwise-independent threads cannot open the same TDB
 846  (as in
 847 \begin_inset CommandInset ref
 848 LatexCommand ref
 849 reference "TDB-Files-Cannot"
 850
 851 \end_inset
 852
 853 ).
 854 \end_layout
 855
 856 \begin_layout Subsubsection
 857 Proposed Solution
 858 \end_layout
 859
 860 \begin_layout Standard
 861 Reachitecting the API to include a tdb_errcode pointer would be a great
 862  deal of churn; we are better to guarantee that the tdb_errcode is per-thread
 863  so the current programming model can be maintained.
 864 \end_layout
 865
 866 \begin_layout Standard
 867 This requires dynamic per-thread allocations, which is awkward with POSIX
 868  threads (pthread_key_create space is limited and we cannot simply allocate
 869  a key for every TDB).
 870 \end_layout
 871
 872 \begin_layout Standard
 873 Internal locking is required to make sure that fcntl locks do not overlap
 874  between threads, and also that the global list of tdbs is maintained.
 875 \end_layout
 876
 877 \begin_layout Standard
 878 The aim is that building tdb with -DTDB_PTHREAD will result in a pthread-safe
 879  version of the library, and otherwise no overhead will exist.
 880 \end_layout
 881
 882 \begin_layout Subsection
 883 *_nonblock Functions And *_mark Functions Expose Implementation
 884 \end_layout
 885
 886 \begin_layout Standard
 887 CTDB
 888 \begin_inset Foot
 889 status collapsed
 890
 891 \begin_layout Plain Layout
 892 Clustered TDB, see http://ctdb.samba.org
 893 \end_layout
 894
 895 \end_inset
 896
 897  wishes to operate on TDB in a non-blocking manner.
 898  This is currently done as follows:
 899 \end_layout
 900
 901 \begin_layout Enumerate
 902 Call the _nonblock variant of an API function (eg.
 903  tdb_lockall_nonblock).
 904  If this fails:
 905 \end_layout
 906
 907 \begin_layout Enumerate
 908 Fork a child process, and wait for it to call the normal variant (eg.
 909  tdb_lockall).
 910 \end_layout
 911
 912 \begin_layout Enumerate
 913 If the child succeeds, call the _mark variant to indicate we already have
 914  the locks (eg.
 915  tdb_lockall_mark).
 916 \end_layout
 917
 918 \begin_layout Enumerate
 919 Upon completion, tell the child to release the locks (eg.
 920  tdb_unlockall).
 921 \end_layout
 922
 923 \begin_layout Enumerate
 924 Indicate to tdb that it should consider the locks removed (eg.
 925  tdb_unlockall_mark).
 926 \end_layout
 927
 928 \begin_layout Standard
 929 There are several issues with this approach.
 930  Firstly, adding two new variants of each function clutters the API for
 931  an obscure use, and so not all functions have three variants.
 932  Secondly, it assumes that all paths of the functions ask for the same locks,
 933  otherwise the parent process will have to get a lock which the child doesn't
 934  have under some circumstances.
 935  I don't believe this is currently the case, but it constrains the implementatio
 936 n.
 937
 938 \end_layout
 939
 940 \begin_layout Subsubsection
 941 \begin_inset CommandInset label
 942 LatexCommand label
 943 name "Proposed-Solution-locking-hook"
 944
 945 \end_inset
 946
 947 Proposed Solution
 948 \end_layout
 949
 950 \begin_layout Standard
 951 Implement a hook for locking methods, so that the caller can control the
 952  calls to create and remove fcntl locks.
 953  In this scenario, ctdbd would operate as follows:
 954 \end_layout
 955
 956 \begin_layout Enumerate
 957 Call the normal API function, eg tdb_lockall().
 958 \end_layout
 959
 960 \begin_layout Enumerate
 961 When the lock callback comes in, check if the child has the lock.
 962  Initially, this is always false.
 963  If so, return 0.
 964  Otherwise, try to obtain it in non-blocking mode.
 965  If that fails, return EWOULDBLOCK.
 966 \end_layout
 967
 968 \begin_layout Enumerate
 969 Release locks in the unlock callback as normal.
 970 \end_layout
 971
 972 \begin_layout Enumerate
 973 If tdb_lockall() fails, see if we recorded a lock failure; if so, call the
 974  child to repeat the operation.
 975 \end_layout
 976
 977 \begin_layout Enumerate
 978 The child records what locks it obtains, and returns that information to
 979  the parent.
 980 \end_layout
 981
 982 \begin_layout Enumerate
 983 When the child has succeeded, goto 1.
 984 \end_layout
 985
 986 \begin_layout Standard
 987 This is flexible enough to handle any potential locking scenario, even when
 988  lock requirements change.
 989  It can be optimized so that the parent does not release locks, just tells
 990  the child which locks it doesn't need to obtain.
 991 \end_layout
 992
 993 \begin_layout Standard
 994 It also keeps the complexity out of the API, and in ctdbd where it is needed.
 995 \end_layout
 996
 997 \begin_layout Subsection
 998 tdb_chainlock Functions Expose Implementation
 999 \end_layout
1000
1001 \begin_layout Standard
1002 tdb_chainlock locks some number of records, including the record indicated
1003  by the given key.
1004  This gave atomicity guarantees; no-one can start a transaction, alter,
1005  read or delete that key while the lock is held.
1006 \end_layout
1007
1008 \begin_layout Standard
1009 It also makes the same guarantee for any other key in the chain, which is
1010  an internal implementation detail and potentially a cause for deadlock.
1011 \end_layout
1012
1013 \begin_layout Subsubsection
1014 Proposed Solution
1015 \end_layout
1016
1017 \begin_layout Standard
1018 None.
1019  It would be nice to have an explicit single entry lock which effected no
1020  other keys.
1021  Unfortunately, this won't work for an entry which doesn't exist.
1022  Thus while chainlock may be implemented more efficiently for the existing
1023  case, it will still have overlap issues with the non-existing case.
1024  So it is best to keep the current (lack of) guarantee about which records
1025  will be effected to avoid constraining our implementation.
1026 \end_layout
1027
1028 \begin_layout Subsection
1029 Signal Handling is Not Race-Free
1030 \end_layout
1031
1032 \begin_layout Standard
1033 The tdb_setalarm_sigptr() call allows the caller's signal handler to indicate
1034  that the tdb locking code should return with a failure, rather than trying
1035  again when a signal is received (and errno == EAGAIN).
1036  This is usually used to implement timeouts.
1037 \end_layout
1038
1039 \begin_layout Standard
1040 Unfortunately, this does not work in the case where the signal is received
1041  before the tdb code enters the fcntl() call to place the lock: the code
1042  will sleep within the fcntl() code, unaware that the signal wants it to
1043  exit.
1044  In the case of long timeouts, this does not happen in practice.
1045 \end_layout
1046
1047 \begin_layout Subsubsection
1048 Proposed Solution
1049 \end_layout
1050
1051 \begin_layout Standard
1052 The locking hooks proposed in
1053 \begin_inset CommandInset ref
1054 LatexCommand ref
1055 reference "Proposed-Solution-locking-hook"
1056
1057 \end_inset
1058
1059  would allow the user to decide on whether to fail the lock acquisition
1060  on a signal.
1061  This allows the caller to choose their own compromise: they could narrow
1062  the race by checking immediately before the fcntl call.
1063 \begin_inset Foot
1064 status collapsed
1065
1066 \begin_layout Plain Layout
1067 It may be possible to make this race-free in some implementations by having
1068  the signal handler alter the struct flock to make it invalid.
1069  This will cause the fcntl() lock call to fail with EINVAL if the signal
1070  occurs before the kernel is entered, otherwise EAGAIN.
1071 \end_layout
1072
1073 \end_inset
1074
1075
1076 \end_layout
1077
1078 \begin_layout Subsection
1079 The API Uses Gratuitous Typedefs, Capitals
1080 \end_layout
1081
1082 \begin_layout Standard
1083 typedefs are useful for providing source compatibility when types can differ
1084  across implementations, or arguably in the case of function pointer definitions
1085  which are hard for humans to parse.
1086  Otherwise it is simply obfuscation and pollutes the namespace.
1087 \end_layout
1088
1089 \begin_layout Standard
1090 Capitalization is usually reserved for compile-time constants and macros.
1091 \end_layout
1092
1093 \begin_layout Description
1094 TDB_CONTEXT There is no reason to use this over 'struct tdb_context'; the
1095  definition isn't visible to the API user anyway.
1096 \end_layout
1097
1098 \begin_layout Description
1099 TDB_DATA There is no reason to use this over struct TDB_DATA; the struct
1100  needs to be understood by the API user.
1101 \end_layout
1102
1103 \begin_layout Description
1104 struct
1105 \begin_inset space ~
1106 \end_inset
1107
1108 TDB_DATA This would normally be called 'struct tdb_data'.
1109 \end_layout
1110
1111 \begin_layout Description
1112 enum
1113 \begin_inset space ~
1114 \end_inset
1115
1116 TDB_ERROR Similarly, this would normally be enum tdb_error.
1117 \end_layout
1118
1119 \begin_layout Subsubsection
1120 Proposed Solution
1121 \end_layout
1122
1123 \begin_layout Standard
1124 None.
1125  Introducing lower case variants would please pedants like myself, but if
1126  it were done the existing ones should be kept.
1127  There is little point forcing a purely cosmetic change upon tdb users.
1128 \end_layout
1129
1130 \begin_layout Subsection
1131 \begin_inset CommandInset label
1132 LatexCommand label
1133 name "tdb_log_func-Doesnt-Take"
1134
1135 \end_inset
1136
1137 tdb_log_func Doesn't Take The Private Pointer
1138 \end_layout
1139
1140 \begin_layout Standard
1141 For API compatibility reasons, the logging function needs to call tdb_get_loggin
1142 g_private() to retrieve the pointer registered by the tdb_open_ex for logging.
1143 \end_layout
1144
1145 \begin_layout Subsubsection
1146 Proposed Solution
1147 \end_layout
1148
1149 \begin_layout Standard
1150 It should simply take an extra argument, since we are prepared to break
1151  the API/ABI.
1152 \end_layout
1153
1154 \begin_layout Subsection
1155 Various Callback Functions Are Not Typesafe
1156 \end_layout
1157
1158 \begin_layout Standard
1159 The callback functions in tdb_set_logging_function (after
1160 \begin_inset CommandInset ref
1161 LatexCommand ref
1162 reference "tdb_log_func-Doesnt-Take"
1163
1164 \end_inset
1165
1166  is resolved), tdb_parse_record, tdb_traverse, tdb_traverse_read and tdb_check
1167  all take void * and must internally convert it to the argument type they
1168  were expecting.
1169 \end_layout
1170
1171 \begin_layout Standard
1172 If this type changes, the compiler will not produce warnings on the callers,
1173  since it only sees void *.
1174 \end_layout
1175
1176 \begin_layout Subsubsection
1177 Proposed Solution
1178 \end_layout
1179
1180 \begin_layout Standard
1181 With careful use of macros, we can create callback functions which give
1182  a warning when used on gcc and the types of the callback and its private
1183  argument differ.
1184  Unsupported compilers will not give a warning, which is no worse than now.
1185  In addition, the callbacks become clearer, as they need not use void *
1186  for their parameter.
1187 \end_layout
1188
1189 \begin_layout Standard
1190 See CCAN's typesafe_cb module at http://ccan.ozlabs.org/info/typesafe_cb.html
1191 \end_layout
1192
1193 \begin_layout Subsection
1194 TDB_CLEAR_IF_FIRST Must Be Specified On All Opens, tdb_reopen_all Problematic
1195 \end_layout
1196
1197 \begin_layout Standard
1198 The TDB_CLEAR_IF_FIRST flag to tdb_open indicates that the TDB file should
1199  be cleared if the caller discovers it is the only process with the TDB
1200  open.
1201  However, if any caller does not specify TDB_CLEAR_IF_FIRST it will not
1202  be detected, so will have the TDB erased underneath them (usually resulting
1203  in a crash).
1204 \end_layout
1205
1206 \begin_layout Standard
1207 There is a similar issue on fork(); if the parent exits (or otherwise closes
1208  the tdb) before the child calls tdb_reopen_all() to establish the lock
1209  used to indicate the TDB is opened by someone, a TDB_CLEAR_IF_FIRST opener
1210  at that moment will believe it alone has opened the TDB and will erase
1211  it.
1212 \end_layout
1213
1214 \begin_layout Subsubsection
1215 Proposed Solution
1216 \end_layout
1217
1218 \begin_layout Standard
1219 Remove TDB_CLEAR_IF_FIRST.
1220  Other workarounds are possible, but see
1221 \begin_inset CommandInset ref
1222 LatexCommand ref
1223 reference "TDB_CLEAR_IF_FIRST-Imposes-Performance"
1224
1225 \end_inset
1226
1227 .
1228 \end_layout
1229
1230 \begin_layout Section
1231 Performance And Scalability Issues
1232 \end_layout
1233
1234 \begin_layout Subsection
1235 \begin_inset CommandInset label
1236 LatexCommand label
1237 name "TDB_CLEAR_IF_FIRST-Imposes-Performance"
1238
1239 \end_inset
1240
1241 TDB_CLEAR_IF_FIRST Imposes Performance Penalty
1242 \end_layout
1243
1244 \begin_layout Standard
1245 When TDB_CLEAR_IF_FIRST is specified, a 1-byte read lock is placed at offset
1246  4 (aka.
1247  the ACTIVE_LOCK).
1248  While these locks never conflict in normal tdb usage, they do add substantial
1249  overhead for most fcntl lock implementations when the kernel scans to detect
1250  if a lock conflict exists.
1251  This is often a single linked list, making the time to acquire and release
1252  a fcntl lock O(N) where N is the number of processes with the TDB open,
1253  not the number actually doing work.
1254 \end_layout
1255
1256 \begin_layout Standard
1257 In a Samba server it is common to have huge numbers of clients sitting idle,
1258  and thus they have weaned themselves off the TDB_CLEAR_IF_FIRST flag.
1259 \begin_inset Foot
1260 status collapsed
1261
1262 \begin_layout Plain Layout
1263 There is a flag to tdb_reopen_all() which is used for this optimization:
1264  if the parent process will outlive the child, the child does not need the
1265  ACTIVE_LOCK.
1266  This is a workaround for this very performance issue.
1267 \end_layout
1268
1269 \end_inset
1270
1271
1272 \end_layout
1273
1274 \begin_layout Subsubsection
1275 Proposed Solution
1276 \end_layout
1277
1278 \begin_layout Standard
1279 Remove the flag.
1280  It was a neat idea, but even trivial servers tend to know when they are
1281  initializing for the first time and can simply unlink the old tdb at that
1282  point.
1283 \end_layout
1284
1285 \begin_layout Subsection
1286 TDB Files Have a 4G Limit
1287 \end_layout
1288
1289 \begin_layout Standard
1290 This seems to be becoming an issue (so much for
1291 \begin_inset Quotes eld
1292 \end_inset
1293
1294 trivial
1295 \begin_inset Quotes erd
1296 \end_inset
1297
1298 !), particularly for ldb.
1299 \end_layout
1300
1301 \begin_layout Subsubsection
1302 Proposed Solution
1303 \end_layout
1304
1305 \begin_layout Standard
1306 A new, incompatible TDB format which uses 64 bit offsets internally rather
1307  than 32 bit as now.
1308  For simplicity of endian conversion (which TDB does on the fly if required),
1309  all values will be 64 bit on disk.
1310  In practice, some upper bits may be used for other purposes, but at least
1311  56 bits will be available for file offsets.
1312 \end_layout
1313
1314 \begin_layout Standard
1315 tdb_open() will automatically detect the old version, and even create them
1316  if TDB_VERSION6 is specified to tdb_open.
1317 \end_layout
1318
1319 \begin_layout Standard
1320 32 bit processes will still be able to access TDBs larger than 4G (assuming
1321  that their off_t allows them to seek to 64 bits), they will gracefully
1322  fall back as they fail to mmap.
1323  This can happen already with large TDBs.
1324 \end_layout
1325
1326 \begin_layout Standard
1327 Old versions of tdb will fail to open the new TDB files (since 28 August
1328  2009, commit 398d0c29290: prior to that any unrecognized file format would
1329  be erased and initialized as a fresh tdb!)
1330 \end_layout
1331
1332 \begin_layout Subsection
1333 TDB Records Have a 4G Limit
1334 \end_layout
1335
1336 \begin_layout Standard
1337 This has not been a reported problem, and the API uses size_t which can
1338  be 64 bit on 64 bit platforms.
1339  However, other limits may have made such an issue moot.
1340 \end_layout
1341
1342 \begin_layout Subsubsection
1343 Proposed Solution
1344 \end_layout
1345
1346 \begin_layout Standard
1347 Record sizes will be 64 bit, with an error returned on 32 bit platforms
1348  which try to access such records (the current implementation would return
1349  TDB_ERR_OOM in a similar case).
1350  It seems unlikely that 32 bit keys will be a limitation, so the implementation
1351  may not support this (see
1352 \begin_inset CommandInset ref
1353 LatexCommand ref
1354 reference "sub:Records-Incur-A"
1355
1356 \end_inset
1357
1358 ).
1359 \end_layout
1360
1361 \begin_layout Subsection
1362 Hash Size Is Determined At TDB Creation Time
1363 \end_layout
1364
1365 \begin_layout Standard
1366 TDB contains a number of hash chains in the header; the number is specified
1367  at creation time, and defaults to 131.
1368  This is such a bottleneck on large databases (as each hash chain gets quite
1369  long), that LDB uses 10,000 for this hash.
1370  In general it is impossible to know what the 'right' answer is at database
1371  creation time.
1372 \end_layout
1373
1374 \begin_layout Subsubsection
1375 Proposed Solution
1376 \end_layout
1377
1378 \begin_layout Standard
1379 After comprehensive performance testing on various scalable hash variants
1380 \begin_inset Foot
1381 status collapsed
1382
1383 \begin_layout Plain Layout
1384 http://rusty.ozlabs.org/?p=89 and http://rusty.ozlabs.org/?p=94 This was annoying
1385  because I was previously convinced that an expanding tree of hashes would
1386  be very close to optimal.
1387 \end_layout
1388
1389 \end_inset
1390
1391 , it became clear that it is hard to beat a straight linear hash table which
1392  doubles in size when it reaches saturation.
1393  There are three details which become important:
1394 \end_layout
1395
1396 \begin_layout Enumerate
1397 On encountering a full bucket, we use the next bucket.
1398 \end_layout
1399
1400 \begin_layout Enumerate
1401 Extra hash bits are stored with the offset, to reduce comparisons.
1402 \end_layout
1403
1404 \begin_layout Enumerate
1405 A marker entry is used on deleting an entry.
1406 \end_layout
1407
1408 \begin_layout Standard
1409 The doubling of the table must be done under a transaction; we will not
1410  reduce it on deletion, so it will be an unusual case.
1411  It will either be placed at the head (other entries will be moved out the
1412  way so we can expand).
1413  We could have a pointer in the header to the current hashtable location,
1414  but that pointer would have to be read frequently to check for hashtable
1415  moves.
1416 \end_layout
1417
1418 \begin_layout Standard
1419 The locking for this is slightly more complex than the chained case; we
1420  currently have one lock per bucket, and that means we would need to expand
1421  the lock if we overflow to the next bucket.
1422  The frequency of such collisions will effect our locking heuristics: we
1423  can always lock more buckets than we need.
1424 \end_layout
1425
1426 \begin_layout Standard
1427 One possible optimization is to only re-check the hash size on an insert
1428  or a lookup miss.
1429 \end_layout
1430
1431 \begin_layout Subsection
1432 \begin_inset CommandInset label
1433 LatexCommand label
1434 name "TDB-Freelist-Is"
1435
1436 \end_inset
1437
1438 TDB Freelist Is Highly Contended
1439 \end_layout
1440
1441 \begin_layout Standard
1442 TDB uses a single linked list for the free list.
1443  Allocation occurs as follows, using heuristics which have evolved over
1444  time:
1445 \end_layout
1446
1447 \begin_layout Enumerate
1448 Get the free list lock for this whole operation.
1449 \end_layout
1450
1451 \begin_layout Enumerate
1452 Multiply length by 1.25, so we always over-allocate by 25%.
1453 \end_layout
1454
1455 \begin_layout Enumerate
1456 Set the slack multiplier to 1.
1457 \end_layout
1458
1459 \begin_layout Enumerate
1460 Examine the current freelist entry: if it is > length but < the current
1461  best case, remember it as the best case.
1462 \end_layout
1463
1464 \begin_layout Enumerate
1465 Multiply the slack multiplier by 1.05.
1466 \end_layout
1467
1468 \begin_layout Enumerate
1469 If our best fit so far is less than length * slack multiplier, return it.
1470  The slack will be turned into a new free record if it's large enough.
1471 \end_layout
1472
1473 \begin_layout Enumerate
1474 Otherwise, go onto the next freelist entry.
1475 \end_layout
1476
1477 \begin_layout Standard
1478 Deleting a record occurs as follows:
1479 \end_layout
1480
1481 \begin_layout Enumerate
1482 Lock the hash chain for this whole operation.
1483 \end_layout
1484
1485 \begin_layout Enumerate
1486 Walk the chain to find the record, keeping the prev pointer offset.
1487 \end_layout
1488
1489 \begin_layout Enumerate
1490 If max_dead is non-zero:
1491 \end_layout
1492
1493 \begin_deeper
1494 \begin_layout Enumerate
1495 Walk the hash chain again and count the dead records.
1496 \end_layout
1497
1498 \begin_layout Enumerate
1499 If it's more than max_dead, bulk free all the dead ones (similar to steps
1500  4 and below, but the lock is only obtained once).
1501 \end_layout
1502
1503 \begin_layout Enumerate
1504 Simply mark this record as dead and return.
1505
1506 \end_layout
1507
1508 \end_deeper
1509 \begin_layout Enumerate
1510 Get the free list lock for the remainder of this operation.
1511 \end_layout
1512
1513 \begin_layout Enumerate
1514 \begin_inset CommandInset label
1515 LatexCommand label
1516 name "right-merging"
1517
1518 \end_inset
1519
1520 Examine the following block to see if it is free; if so, enlarge the current
1521  block and remove that block from the free list.
1522  This was disabled, as removal from the free list was O(entries-in-free-list).
1523 \end_layout
1524
1525 \begin_layout Enumerate
1526 Examine the preceeding block to see if it is free: for this reason, each
1527  block has a 32-bit tailer which indicates its length.
1528  If it is free, expand it to cover our new block and return.
1529 \end_layout
1530
1531 \begin_layout Enumerate
1532 Otherwise, prepend ourselves to the free list.
1533 \end_layout
1534
1535 \begin_layout Standard
1536 Disabling right-merging (step
1537 \begin_inset CommandInset ref
1538 LatexCommand ref
1539 reference "right-merging"
1540
1541 \end_inset
1542
1543 ) causes fragmentation; the other heuristics proved insufficient to address
1544  this, so the final answer to this was that when we expand the TDB file
1545  inside a transaction commit, we repack the entire tdb.
1546 \end_layout
1547
1548 \begin_layout Standard
1549 The single list lock limits our allocation rate; due to the other issues
1550  this is not currently seen as a bottleneck.
1551 \end_layout
1552
1553 \begin_layout Subsubsection
1554 Proposed Solution
1555 \end_layout
1556
1557 \begin_layout Standard
1558 The first step is to remove all the current heuristics, as they obviously
1559  interact, then examine them once the lock contention is addressed.
1560 \end_layout
1561
1562 \begin_layout Standard
1563 The free list must be split to reduce contention.
1564  Assuming perfect free merging, we can at most have 1 free list entry for
1565  each entry.
1566  This implies that the number of free lists is related to the size of the
1567  hash table, but as it is rare to walk a large number of free list entries
1568  we can use far fewer, say 1/32 of the number of hash buckets.
1569 \end_layout
1570
1571 \begin_layout Standard
1572 There are various benefits in using per-size free lists (see
1573 \begin_inset CommandInset ref
1574 LatexCommand ref
1575 reference "sub:TDB-Becomes-Fragmented"
1576
1577 \end_inset
1578
1579 ) but it's not clear this would reduce contention in the common case where
1580  all processes are allocating/freeing the same size.
1581  Thus we almost certainly need to divide in other ways: the most obvious
1582  is to divide the file into zones, and using a free list (or set of free
1583  lists) for each.
1584  This approximates address ordering.
1585 \end_layout
1586
1587 \begin_layout Standard
1588 Note that this means we need to split the free lists when we expand the
1589  file; this is probably acceptable when we double the hash table size, since
1590  that is such an expensive operation already.
1591  In the case of increasing the file size, there is an optimization we can
1592  use: if we use M in the formula above as the file size rounded up to the
1593  next power of 2, we only need reshuffle free lists when the file size crosses
1594  a power of 2 boundary,
1595 \emph on
1596 and
1597 \emph default
1598 reshuffling the free lists is trivial: we simply merge every consecutive
1599  pair of free lists.
1600 \end_layout
1601
1602 \begin_layout Standard
1603 The basic algorithm is as follows.
1604  Freeing is simple:
1605 \end_layout
1606
1607 \begin_layout Enumerate
1608 Identify the correct zone.
1609 \end_layout
1610
1611 \begin_layout Enumerate
1612 Lock the corresponding list.
1613 \end_layout
1614
1615 \begin_layout Enumerate
1616 Re-check the zone (we didn't have a lock, sizes could have changed): relock
1617  if necessary.
1618 \end_layout
1619
1620 \begin_layout Enumerate
1621 Place the freed entry in the list for that zone.
1622 \end_layout
1623
1624 \begin_layout Standard
1625 Allocation is a little more complicated, as we perform delayed coalescing
1626  at this point:
1627 \end_layout
1628
1629 \begin_layout Enumerate
1630 Pick a zone either the zone we last freed into, or based on a
1631 \begin_inset Quotes eld
1632 \end_inset
1633
1634 random
1635 \begin_inset Quotes erd
1636 \end_inset
1637
1638  number.
1639 \end_layout
1640
1641 \begin_layout Enumerate
1642 Lock the corresponding list.
1643 \end_layout
1644
1645 \begin_layout Enumerate
1646 Re-check the zone: relock if necessary.
1647 \end_layout
1648
1649 \begin_layout Enumerate
1650 If the top entry is -large enough, remove it from the list and return it.
1651 \end_layout
1652
1653 \begin_layout Enumerate
1654 Otherwise, coalesce entries in the list.If there was no entry large enough,
1655  unlock the list and try the next zone.
1656 \end_layout
1657
1658 \begin_layout Enumerate
1659 If no zone satisfies, expand the file.
1660 \end_layout
1661
1662 \begin_layout Standard
1663 This optimizes rapid insert/delete of free list entries by not coalescing
1664  them all the time..
1665  First-fit address ordering ordering seems to be fairly good for keeping
1666  fragmentation low (see
1667 \begin_inset CommandInset ref
1668 LatexCommand ref
1669 reference "sub:TDB-Becomes-Fragmented"
1670
1671 \end_inset
1672
1673 ).
1674  Note that address ordering does not need a tailer to coalesce, though if
1675  we needed one we could have one cheaply: see
1676 \begin_inset CommandInset ref
1677 LatexCommand ref
1678 reference "sub:Records-Incur-A"
1679
1680 \end_inset
1681
1682 .
1683
1684 \end_layout
1685
1686 \begin_layout Standard
1687 I anticipate that the number of entries in each free zone would be small,
1688  but it might be worth using one free entry to hold pointers to the others
1689  for cache efficiency.
1690 \end_layout
1691
1692 \begin_layout Subsection
1693 \begin_inset CommandInset label
1694 LatexCommand label
1695 name "sub:TDB-Becomes-Fragmented"
1696
1697 \end_inset
1698
1699 TDB Becomes Fragmented
1700 \end_layout
1701
1702 \begin_layout Standard
1703 Much of this is a result of allocation strategy
1704 \begin_inset Foot
1705 status collapsed
1706
1707 \begin_layout Plain Layout
1708 The Memory Fragmentation Problem: Solved? Johnstone & Wilson 1995 ftp://ftp.cs.ute
1709 xas.edu/pub/garbage/malloc/ismm98.ps
1710 \end_layout
1711
1712 \end_inset
1713
1714  and deliberate hobbling of coalescing; internal fragmentation (aka overallocati
1715 on) is deliberately set at 25%, and external fragmentation is only cured
1716  by the decision to repack the entire db when a transaction commit needs
1717  to enlarge the file.
1718 \end_layout
1719
1720 \begin_layout Subsubsection
1721 Proposed Solution
1722 \end_layout
1723
1724 \begin_layout Standard
1725 The 25% overhead on allocation works in practice for ldb because indexes
1726  tend to expand by one record at a time.
1727  This internal fragmentation can be resolved by having an
1728 \begin_inset Quotes eld
1729 \end_inset
1730
1731 expanded
1732 \begin_inset Quotes erd
1733 \end_inset
1734
1735  bit in the header to note entries that have previously expanded, and allocating
1736  more space for them.
1737 \end_layout
1738
1739 \begin_layout Standard
1740 There are is a spectrum of possible solutions for external fragmentation:
1741  one is to use a fragmentation-avoiding allocation strategy such as best-fit
1742  address-order allocator.
1743  The other end of the spectrum would be to use a bump allocator (very fast
1744  and simple) and simply repack the file when we reach the end.
1745 \end_layout
1746
1747 \begin_layout Standard
1748 There are three problems with efficient fragmentation-avoiding allocators:
1749  they are non-trivial, they tend to use a single free list for each size,
1750  and there's no evidence that tdb allocation patterns will match those recorded
1751  for general allocators (though it seems likely).
1752 \end_layout
1753
1754 \begin_layout Standard
1755 Thus we don't spend too much effort on external fragmentation; we will be
1756  no worse than the current code if we need to repack on occasion.
1757  More effort is spent on reducing freelist contention, and reducing overhead.
1758 \end_layout
1759
1760 \begin_layout Subsection
1761 \begin_inset CommandInset label
1762 LatexCommand label
1763 name "sub:Records-Incur-A"
1764
1765 \end_inset
1766
1767 Records Incur A 28-Byte Overhead
1768 \end_layout
1769
1770 \begin_layout Standard
1771 Each TDB record has a header as follows:
1772 \end_layout
1773
1774 \begin_layout LyX-Code
1775 struct tdb_record {
1776 \end_layout
1777
1778 \begin_layout LyX-Code
1779         tdb_off_t next; /* offset of the next record in the list */
1780 \end_layout
1781
1782 \begin_layout LyX-Code
1783         tdb_len_t rec_len; /* total byte length of record */
1784 \end_layout
1785
1786 \begin_layout LyX-Code
1787         tdb_len_t key_len; /* byte length of key */
1788 \end_layout
1789
1790 \begin_layout LyX-Code
1791         tdb_len_t data_len; /* byte length of data */
1792 \end_layout
1793
1794 \begin_layout LyX-Code
1795         uint32_t full_hash; /* the full 32 bit hash of the key */
1796 \end_layout
1797
1798 \begin_layout LyX-Code
1799         uint32_t magic;   /* try to catch errors */
1800 \end_layout
1801
1802 \begin_layout LyX-Code
1803         /* the following union is implied:
1804 \end_layout
1805
1806 \begin_layout LyX-Code
1807                 union {
1808 \end_layout
1809
1810 \begin_layout LyX-Code
1811                         char record[rec_len];
1812 \end_layout
1813
1814 \begin_layout LyX-Code
1815                         struct {
1816 \end_layout
1817
1818 \begin_layout LyX-Code
1819                                 char key[key_len];
1820 \end_layout
1821
1822 \begin_layout LyX-Code
1823                                 char data[data_len];
1824 \end_layout
1825
1826 \begin_layout LyX-Code
1827                         }
1828 \end_layout
1829
1830 \begin_layout LyX-Code
1831                         uint32_t totalsize; (tailer)
1832 \end_layout
1833
1834 \begin_layout LyX-Code
1835                 }
1836 \end_layout
1837
1838 \begin_layout LyX-Code
1839         */
1840 \end_layout
1841
1842 \begin_layout LyX-Code
1843 };
1844 \end_layout
1845
1846 \begin_layout Standard
1847 Naively, this would double to a 56-byte overhead on a 64 bit implementation.
1848 \end_layout
1849
1850 \begin_layout Subsubsection
1851 Proposed Solution
1852 \end_layout
1853
1854 \begin_layout Standard
1855 We can use various techniques to reduce this for an allocated block:
1856 \end_layout
1857
1858 \begin_layout Enumerate
1859 The 'next' pointer is not required, as we are using a flat hash table.
1860 \end_layout
1861
1862 \begin_layout Enumerate
1863 'rec_len' can instead be expressed as an addition to key_len and data_len
1864  (it accounts for wasted or overallocated length in the record).
1865  Since the record length is always a multiple of 8, we can conveniently
1866  fit it in 32 bits (representing up to 35 bits).
1867 \end_layout
1868
1869 \begin_layout Enumerate
1870 'key_len' and 'data_len' can be reduced.
1871  I'm unwilling to restrict 'data_len' to 32 bits, but instead we can combine
1872  the two into one 64-bit field and using a 5 bit value which indicates at
1873  what bit to divide the two.
1874  Keys are unlikely to scale as fast as data, so I'm assuming a maximum key
1875  size of 32 bits.
1876 \end_layout
1877
1878 \begin_layout Enumerate
1879 'full_hash' is used to avoid a memcmp on the
1880 \begin_inset Quotes eld
1881 \end_inset
1882
1883 miss
1884 \begin_inset Quotes erd
1885 \end_inset
1886
1887  case, but this is diminishing returns after a handful of bits (at 10 bits,
1888  it reduces 99.9% of false memcmp).
1889  As an aside, as the lower bits are already incorporated in the hash table
1890  resolution, the upper bits should be used here.
1891 \end_layout
1892
1893 \begin_layout Enumerate
1894 'magic' does not need to be enlarged: it currently reflects one of 5 values
1895  (used, free, dead, recovery, and unused_recovery).
1896  It is useful for quick sanity checking however, and should not be eliminated.
1897 \end_layout
1898
1899 \begin_layout Enumerate
1900 'tailer' is only used to coalesce free blocks (so a block to the right can
1901  find the header to check if this block is free).
1902  This can be replaced by a single 'free' bit in the header of the following
1903  block (and the tailer only exists in free blocks).
1904 \begin_inset Foot
1905 status collapsed
1906
1907 \begin_layout Plain Layout
1908 This technique from Thomas Standish.
1909  Data Structure Techniques.
1910  Addison-Wesley, Reading, Massachusetts, 1980.
1911 \end_layout
1912
1913 \end_inset
1914
1915  The current proposed coalescing algorithm doesn't need this, however.
1916 \end_layout
1917
1918 \begin_layout Standard
1919 This produces a 16 byte used header like this:
1920 \end_layout
1921
1922 \begin_layout LyX-Code
1923 struct tdb_used_record {
1924 \end_layout
1925
1926 \begin_layout LyX-Code
1927         uint32_t magic : 16,
1928 \end_layout
1929
1930 \begin_layout LyX-Code
1931                  prev_is_free: 1,
1932 \end_layout
1933
1934 \begin_layout LyX-Code
1935                  key_data_divide: 5,
1936 \end_layout
1937
1938 \begin_layout LyX-Code
1939                  top_hash: 10;
1940 \end_layout
1941
1942 \begin_layout LyX-Code
1943         uint32_t extra_octets;
1944 \end_layout
1945
1946 \begin_layout LyX-Code
1947         uint64_t key_and_data_len;
1948 \end_layout
1949
1950 \begin_layout LyX-Code
1951 };
1952 \end_layout
1953
1954 \begin_layout Standard
1955 And a free record like this:
1956 \end_layout
1957
1958 \begin_layout LyX-Code
1959 struct tdb_free_record {
1960 \end_layout
1961
1962 \begin_layout LyX-Code
1963         uint32_t free_magic;
1964 \end_layout
1965
1966 \begin_layout LyX-Code
1967         uint64_t total_length;
1968 \end_layout
1969
1970 \begin_layout LyX-Code
1971         ...
1972 \end_layout
1973
1974 \begin_layout LyX-Code
1975         uint64_t tailer;
1976 \end_layout
1977
1978 \begin_layout LyX-Code
1979 };
1980 \end_layout
1981
1982 \begin_layout LyX-Code
1983
1984 \end_layout
1985
1986 \begin_layout Subsection
1987 Transaction Commit Requires 4 fdatasync
1988 \end_layout
1989
1990 \begin_layout Standard
1991 The current transaction algorithm is:
1992 \end_layout
1993
1994 \begin_layout Enumerate
1995 write_recovery_data();
1996 \end_layout
1997
1998 \begin_layout Enumerate
1999 sync();
2000 \end_layout
2001
2002 \begin_layout Enumerate
2003 write_recovery_header();
2004 \end_layout
2005
2006 \begin_layout Enumerate
2007 sync();
2008 \end_layout
2009
2010 \begin_layout Enumerate
2011 overwrite_with_new_data();
2012 \end_layout
2013
2014 \begin_layout Enumerate
2015 sync();
2016 \end_layout
2017
2018 \begin_layout Enumerate
2019 remove_recovery_header();
2020 \end_layout
2021
2022 \begin_layout Enumerate
2023 sync();
2024 \end_layout
2025
2026 \begin_layout Standard
2027 On current ext3, each sync flushes all data to disk, so the next 3 syncs
2028  are relatively expensive.
2029  But this could become a performance bottleneck on other filesystems such
2030  as ext4.
2031 \end_layout
2032
2033 \begin_layout Subsubsection
2034 Proposed Solution
2035 \end_layout
2036
2037 \begin_layout Standard
2038 Neil Brown points out that this is overzealous, and only one sync is needed:
2039 \end_layout
2040
2041 \begin_layout Enumerate
2042 Bundle the recovery data, a transaction counter and a strong checksum of
2043  the new data.
2044 \end_layout
2045
2046 \begin_layout Enumerate
2047 Strong checksum that whole bundle.
2048 \end_layout
2049
2050 \begin_layout Enumerate
2051 Store the bundle in the database.
2052 \end_layout
2053
2054 \begin_layout Enumerate
2055 Overwrite the oldest of the two recovery pointers in the header (identified
2056  using the transaction counter) with the offset of this bundle.
2057 \end_layout
2058
2059 \begin_layout Enumerate
2060 sync.
2061 \end_layout
2062
2063 \begin_layout Enumerate
2064 Write the new data to the file.
2065 \end_layout
2066
2067 \begin_layout Standard
2068 Checking for recovery means identifying the latest bundle with a valid checksum
2069  and using the new data checksum to ensure that it has been applied.
2070  This is more expensive than the current check, but need only be done at
2071  open.
2072  For running databases, a separate header field can be used to indicate
2073  a transaction in progress; we need only check for recovery if this is set.
2074 \end_layout
2075
2076 \begin_layout Subsection
2077 \begin_inset CommandInset label
2078 LatexCommand label
2079 name "sub:TDB-Does-Not"
2080
2081 \end_inset
2082
2083 TDB Does Not Have Snapshot Support
2084 \end_layout
2085
2086 \begin_layout Subsubsection
2087 Proposed Solution
2088 \end_layout
2089
2090 \begin_layout Standard
2091 None.
2092  At some point you say
2093 \begin_inset Quotes eld
2094 \end_inset
2095
2096 use a real database
2097 \begin_inset Quotes erd
2098 \end_inset
2099
2100 .
2101 \end_layout
2102
2103 \begin_layout Standard
2104 But as a thought experiment, if we implemented transactions to only overwrite
2105  free entries (this is tricky: there must not be a header in each entry
2106  which indicates whether it is free, but use of presence in metadata elsewhere),
2107  and a pointer to the hash table, we could create an entirely new commit
2108  without destroying existing data.
2109  Then it would be easy to implement snapshots in a similar way.
2110 \end_layout
2111
2112 \begin_layout Standard
2113 This would not allow arbitrary changes to the database, such as tdb_repack
2114  does, and would require more space (since we have to preserve the current
2115  and future entries at once).
2116  If we used hash trees rather than one big hash table, we might only have
2117  to rewrite some sections of the hash, too.
2118 \end_layout
2119
2120 \begin_layout Standard
2121 We could then implement snapshots using a similar method, using multiple
2122  different hash tables/free tables.
2123 \end_layout
2124
2125 \begin_layout Subsection
2126 Transactions Cannot Operate in Parallel
2127 \end_layout
2128
2129 \begin_layout Standard
2130 This would be useless for ldb, as it hits the index records with just about
2131  every update.
2132  It would add significant complexity in resolving clashes, and cause the
2133  all transaction callers to write their code to loop in the case where the
2134  transactions spuriously failed.
2135 \end_layout
2136
2137 \begin_layout Subsubsection
2138 Proposed Solution
2139 \end_layout
2140
2141 \begin_layout Standard
2142 We could solve a small part of the problem by providing read-only transactions.
2143  These would allow one write transaction to begin, but it could not commit
2144  until all r/o transactions are done.
2145  This would require a new RO_TRANSACTION_LOCK, which would be upgraded on
2146  commit.
2147 \end_layout
2148
2149 \begin_layout Subsection
2150 Default Hash Function Is Suboptimal
2151 \end_layout
2152
2153 \begin_layout Standard
2154 The Knuth-inspired multiplicative hash used by tdb is fairly slow (especially
2155  if we expand it to 64 bits), and works best when the hash bucket size is
2156  a prime number (which also means a slow modulus).
2157  In addition, it is highly predictable which could potentially lead to a
2158  Denial of Service attack in some TDB uses.
2159 \end_layout
2160
2161 \begin_layout Subsubsection
2162 Proposed Solution
2163 \end_layout
2164
2165 \begin_layout Standard
2166 The Jenkins lookup3 hash
2167 \begin_inset Foot
2168 status open
2169
2170 \begin_layout Plain Layout
2171 http://burtleburtle.net/bob/c/lookup3.c
2172 \end_layout
2173
2174 \end_inset
2175
2176  is a fast and superbly-mixing hash.
2177  It's used by the Linux kernel and almost everything else.
2178  This has the particular properties that it takes an initial seed, and produces
2179  two 32 bit hash numbers, which we can combine into a 64-bit hash.
2180 \end_layout
2181
2182 \begin_layout Standard
2183 The seed should be created at tdb-creation time from some random source,
2184  and placed in the header.
2185  This is far from foolproof, but adds a little bit of protection against
2186  hash bombing.
2187 \end_layout
2188
2189 \begin_layout Subsection
2190 \begin_inset CommandInset label
2191 LatexCommand label
2192 name "Reliable-Traversal-Adds"
2193
2194 \end_inset
2195
2196 Reliable Traversal Adds Complexity
2197 \end_layout
2198
2199 \begin_layout Standard
2200 We lock a record during traversal iteration, and try to grab that lock in
2201  the delete code.
2202  If that grab on delete fails, we simply mark it deleted and continue onwards;
2203  traversal checks for this condition and does the delete when it moves off
2204  the record.
2205 \end_layout
2206
2207 \begin_layout Standard
2208 If traversal terminates, the dead record may be left indefinitely.
2209 \end_layout
2210
2211 \begin_layout Subsubsection
2212 Proposed Solution
2213 \end_layout
2214
2215 \begin_layout Standard
2216 Remove reliability guarantees; see
2217 \begin_inset CommandInset ref
2218 LatexCommand ref
2219 reference "traverse-Proposed-Solution"
2220
2221 \end_inset
2222
2223 .
2224 \end_layout
2225
2226 \begin_layout Subsection
2227 Fcntl Locking Adds Overhead
2228 \end_layout
2229
2230 \begin_layout Standard
2231 Placing a fcntl lock means a system call, as does removing one.
2232  This is actually one reason why transactions can be faster (everything
2233  is locked once at transaction start).
2234  In the uncontended case, this overhead can theoretically be eliminated.
2235 \end_layout
2236
2237 \begin_layout Subsubsection
2238 Proposed Solution
2239 \end_layout
2240
2241 \begin_layout Standard
2242 None.
2243 \end_layout
2244
2245 \begin_layout Standard
2246 We tried this before with spinlock support, in the early days of TDB, and
2247  it didn't make much difference except in manufactured benchmarks.
2248 \end_layout
2249
2250 \begin_layout Standard
2251 We could use spinlocks (with futex kernel support under Linux), but it means
2252  that we lose automatic cleanup when a process dies with a lock.
2253  There is a method of auto-cleanup under Linux, but it's not supported by
2254  other operating systems.
2255  We could reintroduce a clear-if-first-style lock and sweep for dead futexes
2256  on open, but that wouldn't help the normal case of one concurrent opener
2257  dying.
2258  Increasingly elaborate repair schemes could be considered, but they require
2259  an ABI change (everyone must use them) anyway, so there's no need to do
2260  this at the same time as everything else.
2261 \end_layout
2262
2263 \begin_layout Subsection
2264 Some Transactions Don't Require Durability
2265 \end_layout
2266
2267 \begin_layout Standard
2268 Volker points out that gencache uses a CLEAR_IF_FIRST tdb for normal (fast)
2269  usage, and occasionally empties the results into a transactional TDB.
2270  This kind of usage prioritizes performance over durability: as long as
2271  we are consistent, data can be lost.
2272 \end_layout
2273
2274 \begin_layout Standard
2275 This would be more neatly implemented inside tdb: a
2276 \begin_inset Quotes eld
2277 \end_inset
2278
2279 soft
2280 \begin_inset Quotes erd
2281 \end_inset
2282
2283  transaction commit (ie.
2284  syncless) which meant that data may be reverted on a crash.
2285 \end_layout
2286
2287 \begin_layout Subsubsection
2288 Proposed Solution
2289 \end_layout
2290
2291 \begin_layout Standard
2292 None.
2293 \end_layout
2294
2295 \begin_layout Standard
2296 Unfortunately any transaction scheme which overwrites old data requires
2297  a sync before that overwrite to avoid the possibility of corruption.
2298 \end_layout
2299
2300 \begin_layout Standard
2301 It seems possible to use a scheme similar to that described in
2302 \begin_inset CommandInset ref
2303 LatexCommand ref
2304 reference "sub:TDB-Does-Not"
2305
2306 \end_inset
2307
2308 ,where transactions are committed without overwriting existing data, and
2309  an array of top-level pointers were available in the header.
2310  If the transaction is
2311 \begin_inset Quotes eld
2312 \end_inset
2313
2314 soft
2315 \begin_inset Quotes erd
2316 \end_inset
2317
2318  then we would not need a sync at all: existing processes would pick up
2319  the new hash table and free list and work with that.
2320 \end_layout
2321
2322 \begin_layout Standard
2323 At some later point, a sync would allow recovery of the old data into the
2324  free lists (perhaps when the array of top-level pointers filled).
2325  On crash, tdb_open() would examine the array of top levels, and apply the
2326  transactions until it encountered an invalid checksum.
2327 \end_layout
2328
2329 \end_body
2330 \end_document
2331 @
2332
2333
2334 1.5
2335 log
2336 @Soft transaction commit
2337 @
2338 text
2339 @d38 1
2340 a38 1
2341 \author "Rusty Russell,,,"
2342 a52 4
2343
2344 \change_deleted 0 1280141199
2345 10-May-2010
2346 \change_inserted 0 1280141202
2347 a53 2
2348 \change_unchanged
2349
2350 a2028 2
2351
2352 \change_inserted 0 1280140902
2353 a2034 2
2354
2355 \change_unchanged
2356 a2212 2
2357 \change_inserted 0 1280140661
2358
2359 a2215 2
2360
2361 \change_inserted 0 1280140703
2362 a2219 2
2363
2364 \change_inserted 0 1280708312
2365 a2226 2
2366
2367 \change_inserted 0 1280708400
2368 a2239 2
2369
2370 \change_inserted 0 1280140836
2371 a2243 2
2372
2373 \change_inserted 0 1280708255
2374 a2247 2
2375
2376 \change_inserted 0 1280708374
2377 a2252 2
2378
2379 \change_inserted 0 1280141181
2380 a2274 2
2381
2382 \change_inserted 0 1280141345
2383 @
2384
2385
2386 1.4
2387 log
2388 @Merge changes
2389 @
2390 text
2391 @d38 1
2392 a38 1
2393 \author ""
2394 d53 2
2395 d56 4
2396 d2035 10
2397 d2223 84
2398 @
2399
2400
2401 1.3
2402 log
2403 @Transaction and freelist rethink.
2404 @
2405 text
2406 @d38 1
2407 a38 1
2408 \author "Rusty Russell,,,"
2409 d53 1
2410 a53 1
2411 27-April-2010
2412 d662 1
2413 a662 5
2414  behavior of disallowing
2415 \change_inserted 0 1272940179
2416 nested
2417 \change_unchanged
2418 transactions should become the default.
2419 a1210 2
2420 \change_inserted 0 1272944650
2421
2422 a1214 2
2423
2424 \change_inserted 0 1272944763
2425 a1218 2
2426 \change_unchanged
2427
2428 a1223 2
2429 \change_unchanged
2430
2431 a1301 2
2432
2433 \change_inserted 0 1273478114
2434 a1310 2
2435 \change_unchanged
2436
2437 d1515 1
2438 a1515 11
2439 The free list
2440 \change_deleted 0 1273469807
2441 should
2442 \change_inserted 0 1273469810
2443 must
2444 \change_unchanged
2445  be split
2446 \change_deleted 0 1273469815
2447 into multiple lists
2448 \change_unchanged
2449 to reduce contention.
2450 a1520 2
2451 \change_inserted 0 1273470006
2452
2453 a1523 2
2454
2455 \change_inserted 0 1273492055
2456 a1539 2
2457
2458 \change_inserted 0 1273483888
2459 a1551 2
2460 \change_unchanged
2461
2462 a1554 8
2463
2464 \change_deleted 0 1272942055
2465 There are various ways to organize these lisys, but because we want to be
2466  able to quickly identify which free list an entry is in, and reduce the
2467  number of locks required for merging, we will use zoning (eg.
2468  each free list covers some fixed fraction of the file).
2469
2470 \change_inserted 0 1273484187
2471 d1556 1
2472 a1556 7
2473
2474 \change_deleted 0 1273484194
2475 The algorithm for f
2476 \change_inserted 0 1273484194
2477 F
2478 \change_unchanged
2479 reeing is simple:
2480 d1560 1
2481 a1560 7
2482 Identify the correct
2483 \change_deleted 0 1273482856
2484 free list
2485 \change_inserted 0 1273482857
2486 zone
2487 \change_unchanged
2488 .
2489 d1564 1
2490 a1564 7
2491 Lock the
2492 \change_inserted 0 1273482895
2493 corresponding
2494 \change_unchanged
2495 list
2496 \change_inserted 0 1273482863
2497 .
2498 a1567 2
2499
2500 \change_inserted 0 1273482909
2501 d1573 1
2502 a1573 13
2503
2504 \change_deleted 0 1273482885
2505 , and p
2506 \change_inserted 0 1273482888
2507 P
2508 \change_unchanged
2509 lace the freed entry
2510 \change_deleted 0 1273492415
2511 at the head
2512 \change_inserted 0 1273492415
2513 in the list for that zone
2514 \change_unchanged
2515 .
2516 d1577 2
2517 a1578 7
2518 Allocation is a little more complicated, as we
2519 \change_deleted 0 1273483240
2520 merge entries as we walk the list:
2521 \change_inserted 0 1273484250
2522 perform delayed coalescing at this point:
2523 \change_unchanged
2524
2525 d1582 1
2526 a1582 19
2527 Pick a
2528 \change_deleted 0 1273482955
2529 free list;
2530 \change_inserted 0 1273482957
2531 zone
2532 \change_unchanged
2533  either the
2534 \change_deleted 0 1273482962
2535 list
2536 \change_inserted 0 1273482962
2537 zone
2538 \change_unchanged
2539  we last freed
2540 \change_deleted 0 1273482966
2541 o
2542 \change_inserted 0 1273482966
2543 i
2544 \change_unchanged
2545 nto, or based on a
2546 d1594 1
2547 a1594 9
2548 Lock th
2549 \change_inserted 0 1273482980
2550 e corresponding
2551 \change_deleted 0 1273482973
2552 at
2553 \change_unchanged
2554  list.
2555 \change_inserted 0 1273482982
2556
2557 a1597 2
2558
2559 \change_inserted 0 1273483084
2560 a1598 53
2561 \change_unchanged
2562
2563 \end_layout
2564
2565 \begin_layout Enumerate
2566 If the top entry is
2567 \change_deleted 0 1273492155
2568 well-sized,
2569 \change_inserted 0 1273492159
2570 -large enough,
2571 \change_unchanged
2572 remove it from the list and return it.
2573 \end_layout
2574
2575 \begin_layout Enumerate
2576 Otherwise,
2577 \change_inserted 0 1273492206
2578 coalesce entries in the list.
2579 \change_deleted 0 1273492200
2580 examine the entry to the right of it in the file.
2581  If it is free:
2582 \end_layout
2583
2584 \begin_deeper
2585 \begin_layout Enumerate
2586
2587 \change_deleted 0 1273492200
2588 If that entry is in a different list, lock that list too.
2589 \end_layout
2590
2591 \begin_layout Enumerate
2592
2593 \change_deleted 0 1273492200
2594 If we had to place a new lock, re-check that the entry is free.
2595 \end_layout
2596
2597 \begin_layout Enumerate
2598
2599 \change_deleted 0 1273492200
2600 Remove that entry from its free list and expand this entry to cover it.
2601 \end_layout
2602
2603 \begin_layout Enumerate
2604
2605 \change_deleted 0 1273485554
2606 Goto step 3.
2607 \end_layout
2608
2609 \end_deeper
2610 \begin_layout Enumerate
2611
2612 \change_inserted 0 1273485311
2613 If there was no entry large enough, unlock the list and try the next zone.
2614 d1602 1
2615 a1602 5
2616
2617 \change_deleted 0 1273483646
2618 Repeat step 3 with each entry in the list.
2619 \change_unchanged
2620
2621 d1606 2
2622 a1607 5
2623
2624 \change_deleted 0 1273483668
2625 Unlock the list and repeat step 2 with the next list.
2626 \change_unchanged
2627
2628 d1611 1
2629 a1611 7
2630 If no
2631 \change_deleted 0 1273483671
2632 list
2633 \change_inserted 0 1273483671
2634 zone
2635 \change_unchanged
2636  satisfies, expand the file.
2637 d1615 2
2638 a1616 9
2639 This optimizes rapid insert/delete of free list entries
2640 \change_inserted 0 1273485794
2641  by not coalescing them all the time.
2642 \change_deleted 0 1273483685
2643 , and allows us to get rid of the tailer altogether
2644 \change_unchanged
2645 .
2646
2647 \change_inserted 0 1273492299
2648 a1638 39
2649
2650 \change_deleted 0 1273476840
2651 The question of
2652 \begin_inset Quotes eld
2653 \end_inset
2654
2655 well-sized
2656 \begin_inset Quotes erd
2657 \end_inset
2658
2659  free entries is more difficult: the 25% overhead works in practice for
2660  ldb because indexes tend to expand by one record at a time.
2661  This can be resolved by having an
2662 \begin_inset Quotes eld
2663 \end_inset
2664
2665 expanded
2666 \begin_inset Quotes erd
2667 \end_inset
2668
2669  bit in the header to note entries that have previously expanded, and allocating
2670  more space for them.
2671  Whether the
2672 \begin_inset Quotes eld
2673 \end_inset
2674
2675 increasing slack
2676 \begin_inset Quotes erd
2677 \end_inset
2678
2679  algorithm should be implemented or first-fit used is still unknown: we
2680  will determine this once these other ideas are implemented.
2681 \change_inserted 0 1273483750
2682
2683 \end_layout
2684
2685 \begin_layout Standard
2686
2687 \change_inserted 0 1273492450
2688 a1644 2
2689
2690 \change_inserted 0 1273470441
2691 a1654 2
2692
2693 \change_inserted 0 1273476556
2694 a1659 2
2695
2696 \change_inserted 0 1273470423
2697 a1661 2
2698 \change_unchanged
2699
2700 a1672 2
2701
2702 \change_inserted 0 1273476847
2703 a1676 2
2704
2705 \change_inserted 0 1273476886
2706 a1691 2
2707
2708 \change_inserted 0 1273477233
2709 a1699 2
2710
2711 \change_inserted 0 1273477534
2712 a1706 2
2713
2714 \change_inserted 0 1273482700
2715 a1712 2
2716
2717 \change_inserted 0 1273478079
2718 a1722 2
2719
2720 \change_inserted 0 1273477839
2721 a1726 2
2722
2723 \change_inserted 0 1273477925
2724 a1730 2
2725
2726 \change_inserted 0 1273477925
2727 a1734 2
2728
2729 \change_inserted 0 1273477925
2730 a1738 2
2731
2732 \change_inserted 0 1273477925
2733 a1742 2
2734
2735 \change_inserted 0 1273477925
2736 a1746 2
2737
2738 \change_inserted 0 1273477925
2739 a1750 2
2740
2741 \change_inserted 0 1273477925
2742 a1754 2
2743
2744 \change_inserted 0 1273477925
2745 a1758 2
2746
2747 \change_inserted 0 1273477925
2748 a1762 2
2749
2750 \change_inserted 0 1273477925
2751 a1766 2
2752
2753 \change_inserted 0 1273477925
2754 a1770 2
2755
2756 \change_inserted 0 1273477925
2757 a1774 2
2758
2759 \change_inserted 0 1273477925
2760 a1778 2
2761
2762 \change_inserted 0 1273477925
2763 a1782 2
2764
2765 \change_inserted 0 1273477925
2766 a1786 2
2767
2768 \change_inserted 0 1273477925
2769 a1790 2
2770
2771 \change_inserted 0 1273477925
2772 a1794 2
2773
2774 \change_inserted 0 1273477925
2775 a1798 2
2776
2777 \change_inserted 0 1273492522
2778 a1802 2
2779
2780 \change_inserted 0 1273492530
2781 a1806 2
2782
2783 \change_inserted 0 1273492546
2784 a1810 2
2785
2786 \change_inserted 0 1273478239
2787 a1814 2
2788
2789 \change_inserted 0 1273479960
2790 a1821 2
2791
2792 \change_inserted 0 1273480265
2793 a1830 2
2794
2795 \change_inserted 0 1273480354
2796 a1845 2
2797
2798 \change_inserted 0 1273478968
2799 a1851 2
2800
2801 \change_inserted 0 1273492604
2802 a1859 2
2803
2804 \change_inserted 0 1273479572
2805 a1862 2
2806 \change_unchanged
2807
2808 a1870 2
2809
2810 \change_inserted 0 1273480282
2811 a1874 2
2812
2813 \change_inserted 0 1273478931
2814 a1878 2
2815
2816 \change_inserted 0 1273481549
2817 a1882 2
2818
2819 \change_inserted 0 1273481557
2820 a1886 2
2821
2822 \change_inserted 0 1273480307
2823 a1890 2
2824
2825 \change_inserted 0 1273480335
2826 a1894 2
2827
2828 \change_inserted 0 1273479897
2829 a1898 2
2830
2831 \change_inserted 0 1273479653
2832 a1902 2
2833
2834 \change_inserted 0 1273480371
2835 a1906 2
2836
2837 \change_inserted 0 1273480464
2838 a1910 2
2839
2840 \change_inserted 0 1273480399
2841 a1914 2
2842
2843 \change_inserted 0 1273480425
2844 a1918 2
2845
2846 \change_inserted 0 1273480453
2847 a1922 2
2848
2849 \change_inserted 0 1273480455
2850 a1926 2
2851
2852 \change_inserted 0 1273480450
2853 a1930 2
2854
2855 \change_inserted 0 1273480452
2856 a1935 2
2857 \change_inserted 0 1273478830
2858
2859 a1942 5
2860
2861 \change_deleted 0 1273481604
2862 In theory, we could get away with 2: one after we write the new data, and
2863  one to somehow atomically change over to it.
2864 \change_inserted 0 1273481632
2865 a1946 2
2866
2867 \change_inserted 0 1273481724
2868 a1950 2
2869
2870 \change_inserted 0 1273481713
2871 a1954 2
2872
2873 \change_inserted 0 1273481717
2874 a1958 2
2875
2876 \change_inserted 0 1273481730
2877 a1962 2
2878
2879 \change_inserted 0 1273481736
2880 a1966 2
2881
2882 \change_inserted 0 1273481744
2883 a1970 2
2884
2885 \change_inserted 0 1273481748
2886 a1974 2
2887
2888 \change_inserted 0 1273482185
2889 a1978 2
2890
2891 \change_inserted 0 1273482259
2892 a1989 50
2893
2894 \change_deleted 0 1273481848
2895 None.
2896  Trying to rewrite the transaction code is a separate experiment, which
2897  I encourage someone else to do.
2898  At some point you say
2899 \begin_inset Quotes eld
2900 \end_inset
2901
2902 use a real database
2903 \begin_inset Quotes erd
2904 \end_inset
2905
2906 .
2907 \end_layout
2908
2909 \begin_layout Standard
2910
2911 \change_deleted 0 1273481848
2912 But as a thought experiment:
2913 \change_unchanged
2914
2915 \end_layout
2916
2917 \begin_layout Standard
2918
2919 \change_deleted 0 1273481788
2920 Say there was a pointer in the header which said where the hash table and
2921  free list tables were, and that no blocks were labeled with whether they
2922  were free or not (it had to be derived from what list they were in).
2923  We could create new hash table and free list in some free space, and populate
2924  it as we want the post-committed state to look.
2925  Then we sync, then we switch the offset in the header, then we sync again.
2926 \end_layout
2927
2928 \begin_layout Standard
2929
2930 \change_deleted 0 1273481788
2931 This would not allow arbitrary changes to the database, such as tdb_repack
2932  does, and would require more space (since we have to preserve the current
2933  and future entries at once).
2934  If we used hash trees rather than one big hash table, we might only have
2935  to rewrite some sections of the hash, too.
2936 \change_inserted 0 1273481854
2937
2938 \end_layout
2939
2940 \begin_layout Standard
2941
2942 \change_inserted 0 1273482102
2943 a1993 2
2944
2945 \change_inserted 0 1273482061
2946 a1998 2
2947
2948 \change_inserted 0 1273482063
2949 a2002 2
2950
2951 \change_inserted 0 1273482072
2952 a2006 2
2953
2954 \change_inserted 0 1273482139
2955 a2011 2
2956
2957 \change_inserted 0 1273482364
2958 a2015 2
2959
2960 \change_inserted 0 1273482163
2961 a2019 2
2962
2963 \change_inserted 0 1273482493
2964 a2037 2
2965
2966 \change_inserted 0 1273482536
2967 a2046 2
2968 \change_unchanged
2969
2970 a2049 2
2971
2972 \change_inserted 0 1273482641
2973 a2058 2
2974
2975 \change_inserted 0 1273481827
2976 d2067 2
2977 a2068 11
2978 We could
2979 \change_inserted 0 1273481829
2980 then
2981 \change_unchanged
2982 implement snapshots using a similar method
2983 \change_deleted 0 1273481838
2984  to the above, only
2985 \change_inserted 0 1273481840
2986 ,
2987 \change_unchanged
2988  using multiple different hash tables/free tables.
2989 @
2990
2991
2992 1.2
2993 log
2994 @After first feedback (Ronnie & Volker)
2995 @
2996 text
2997 @d1314 13
2998 d1531 11
2999 a1541 1
3000 The free list should be split into multiple lists to reduce contention.
3001 d1547 39
3002 d1596 7
3003 d1604 1
3004 a1604 1
3005 The algorithm for freeing is simple:
3006 d1608 7
3007 a1614 1
3008 Identify the correct free list.
3009 d1618 30
3010 a1647 1
3011 Lock the list, and place the freed entry at the head.
3012 d1651 7
3013 a1657 2
3014 Allocation is a little more complicated, as we merge entries as we walk
3015  the list:
3016 d1661 19
3017 a1679 1
3018 Pick a free list; either the list we last freed onto, or based on a
3019 d1691 17
3020 a1707 1
3021 Lock that list.
3022 d1711 7
3023 a1717 1
3024 If the top entry is well-sized, remove it from the list and return it.
3025 d1721 5
3026 a1725 1
3027 Otherwise, examine the entry to the right of it in the file.
3028 d1731 2
3029 d1737 2
3030 d1743 2
3031 d1749 2
3032 d1756 8
3033 d1765 2
3034 d1770 2
3035 d1773 2
3036 d1778 7
3037 a1784 1
3038 If no list satisfies, expand the file.
3039 d1788 28
3040 a1815 2
3041 This optimizes rapid insert/delete of free list entries, and allows us to
3042  get rid of the tailer altogether.
3043 d1819 2
3044 d1851 1
3045 a1851 1
3046 \change_inserted 0 1272941474
3047 d1857 303
3048 a2159 18
3049 \change_inserted 0 1272942759
3050 There are various ways to organize these lists, but because we want to be
3051  able to quickly identify which free list an entry is in, and reduce the
3052  number of locks required for merging, we will use zoning (eg.
3053  each of the N free lists in a tdb file of size M covers a fixed fraction
3054  M/N).
3055  Note that this means we need to reshuffle the free lists when we expand
3056  the file; this is probably acceptable when we double the hash table size,
3057  since that is such an expensive operation already.
3058  In the case of increasing the file size, there is an optimization we can
3059  use: if we use M in the formula above as the file size rounded up to the
3060  next power of 2, we only need reshuffle free lists when the file size crosses
3061  a power of 2 boundary,
3062 \emph on
3063 and
3064 \emph default
3065 reshuffling the free lists is trivial: we simply merge every consecutive
3066  pair of free lists.
3067 d2164 107
3068 d2276 2
3069 d2280 59
3070 d2346 2
3071 d2363 2
3072 d2366 2
3073 d2371 2
3074 d2382 2
3075 d2389 57
3076 d2458 13
3077 d2474 32
3078 a2505 2
3079 We could implement snapshots using a similar method to the above, only using
3080  multiple different hash tables/free tables.
3081 @
3082
3083
3084 1.1
3085 log
3086 @Initial revision
3087 @
3088 text
3089 @d1 1
3090 a1 1
3091 #LyX 1.6.4 created this file. For more info see http://www.lyx.org/
3092 d36 3
3093 a38 3
3094 \tracking_changes false
3095 \output_changes false
3096 \author ""
3097 d662 5
3098 a666 1
3099  behavior of disallowing transactions should become the default.
3100 d1215 21
3101 d1527 2
3102 d1533 3
3103 a1535 1
3104  The algorithm for freeing is simple:
3105 d1642 26
3106 @