2 * Copyright 2000-2002 by Hans Reiser, licensing governed by reiserfs/README
12 ** directory_part_size
16 ** are_leaves_removable
20 ** is_left_neighbor_in_cache
24 ** can_node_be_removed
26 ** dc_check_balance_internal
27 ** dc_check_balance_leaf
38 #include <linux/config.h>
39 #include <linux/sched.h>
40 #include <linux/string.h>
41 #include <linux/locks.h>
42 #include <linux/reiserfs_fs.h>
45 /* To make any changes in the tree we find a node, that contains item
46 to be changed/deleted or position in the node we insert a new item
47 to. We call this node S. To do balancing we need to decide what we
48 will shift to left/right neighbor, or to a new node, where new item
49 will be etc. To make this analysis simpler we build virtual
50 node. Virtual node is an array of items, that will replace items of
51 node S. (For instance if we are going to delete an item, virtual
52 node does not contain it). Virtual node keeps information about
53 item sizes and types, mergeability of first and last items, sizes
54 of all entries in directory item. We use this array of items when
55 calculating what we can shift to neighbors and how many nodes we
56 have to have if we do not do any shifting, if we shift to left/right
57 neighbor or to both. */
60 /* taking item number in virtual node, returns number of item, that it has in source buffer */
/*
 * Given an item's index in the virtual node, return the index that item
 * has in the source buffer, for the given balance mode and the index of
 * the affected (inserted/deleted/pasted-into) item.
 * NOTE(review): this extract elides interior lines (returns/braces);
 * comments below describe only the visible code.
 */
61 static inline int old_item_num (int new_num, int affected_item_num, int mode)
/* paste/cut keep the item count; items before the affected one keep their index */
63 if (mode == M_PASTE || mode == M_CUT || new_num < affected_item_num)
/* insert shifts following items by one in the virtual node — presumably
   the elided branch returns new_num - 1; confirm against full source */
66 if (mode == M_INSERT) {
69 "vs-8005: for INSERT mode and item number of inserted item");
/* only M_DELETE remains here; any other mode is a caller bug */
74 RFALSE( mode != M_DELETE,
75 "vs-8010: old_item_num: mode must be M_DELETE (mode = \'%c\'", mode);
/*
 * Build the virtual node for level h: an array of virtual items that
 * models the contents of S[h] *after* the pending operation
 * (insert/delete/cut/paste), including item sizes and left/right
 * mergeability flags used by the shift calculations below.
 * NOTE(review): this extract elides interior lines (braces, some
 * statements); comments describe only the visible code.
 */
80 static void create_virtual_node (struct tree_balance * tb, int h)
82 struct item_head * ih;
83 struct virtual_node * vn = tb->tb_vn;
85 struct buffer_head * Sh; /* this comes from tb->S[h] */
87 Sh = PATH_H_PBUFFER (tb->tb_path, h);
89 /* size of changed node */
90 vn->vn_size = MAX_CHILD_SIZE (Sh) - B_FREE_SPACE (Sh) + tb->insert_size[h];
92 /* for internal nodes array if virtual items is not created */
94 vn->vn_nr_item = (vn->vn_size - DC_SIZE) / (DC_SIZE + KEY_SIZE);
98 /* number of items in virtual node */
99 vn->vn_nr_item = B_NR_ITEMS (Sh) + ((vn->vn_mode == M_INSERT)? 1 : 0) - ((vn->vn_mode == M_DELETE)? 1 : 0);
101 /* first virtual item */
102 vn->vn_vi = (struct virtual_item *)(tb->tb_vn + 1);
103 memset (vn->vn_vi, 0, vn->vn_nr_item * sizeof (struct virtual_item));
104 vn->vn_free_ptr += vn->vn_nr_item * sizeof (struct virtual_item);
107 /* first item in the node */
108 ih = B_N_PITEM_HEAD (Sh, 0);
110 /* define the mergeability for 0-th item (if it is not being deleted) */
111 if (op_is_left_mergeable (&(ih->ih_key), Sh->b_size) && (vn->vn_mode != M_DELETE || vn->vn_affected_item_num))
112 vn->vn_vi[0].vi_type |= VI_TYPE_LEFT_MERGEABLE;
114 /* go through all items those remain in the virtual node (except for the new (inserted) one) */
115 for (new_num = 0; new_num < vn->vn_nr_item; new_num ++) {
117 struct virtual_item * vi = vn->vn_vi + new_num;
118 int is_affected = ((new_num != vn->vn_affected_item_num) ? 0 : 1);
/* the inserted item has no source counterpart; handled after this loop */
121 if (is_affected && vn->vn_mode == M_INSERT)
124 /* get item number in source node */
125 j = old_item_num (new_num, vn->vn_affected_item_num, vn->vn_mode);
127 vi->vi_item_len += ih_item_len(ih + j) + IH_SIZE;
129 vi->vi_item = B_I_PITEM (Sh, ih + j);
130 vi->vi_uarea = vn->vn_free_ptr;
132 // FIXME: there is no check, that item operation did not
133 // consume too much memory
134 vn->vn_free_ptr += op_create_vi (vn, vi, is_affected, tb->insert_size [0]);
135 if (tb->vn_buf + tb->vn_buf_size < vn->vn_free_ptr)
136 reiserfs_panic (tb->tb_sb, "vs-8030: create_virtual_node: "
137 "virtual node space consumed");
140 /* this is not being changed */
/* paste/cut grow the affected item by insert_size[0] (negative for cut) —
   sign convention inferred from usage; confirm against full source */
143 if (vn->vn_mode == M_PASTE || vn->vn_mode == M_CUT) {
144 vn->vn_vi[new_num].vi_item_len += tb->insert_size[0];
145 vi->vi_new_data = vn->vn_data; // pointer to data which is going to be pasted
150 /* virtual inserted item is not defined yet */
151 if (vn->vn_mode == M_INSERT) {
152 struct virtual_item * vi = vn->vn_vi + vn->vn_affected_item_num;
154 RFALSE( vn->vn_ins_ih == 0,
155 "vs-8040: item header of inserted item is not specified");
156 vi->vi_item_len = tb->insert_size[0];
157 vi->vi_ih = vn->vn_ins_ih;
158 vi->vi_item = vn->vn_data;
159 vi->vi_uarea = vn->vn_free_ptr;
161 op_create_vi (vn, vi, 0/*not pasted or cut*/, tb->insert_size [0]);
164 /* set right merge flag we take right delimiting key and check whether it is a mergeable item */
168 key = B_N_PDELIM_KEY (tb->CFR[0], tb->rkey[0]);
169 if (op_is_left_mergeable (key, Sh->b_size) && (vn->vn_mode != M_DELETE ||
170 vn->vn_affected_item_num != B_NR_ITEMS (Sh) - 1))
171 vn->vn_vi[vn->vn_nr_item-1].vi_type |= VI_TYPE_RIGHT_MERGEABLE;
173 #ifdef CONFIG_REISERFS_CHECK
/* debug-only sanity check: deleting the last item while the right
   delimiting key is still mergeable is legal only for a single-entry
   directory item */
174 if (op_is_left_mergeable (key, Sh->b_size) &&
175 !(vn->vn_mode != M_DELETE || vn->vn_affected_item_num != B_NR_ITEMS (Sh) - 1) ) {
176 /* we delete last item and it could be merged with right neighbor's first item */
177 if (!(B_NR_ITEMS (Sh) == 1 && is_direntry_le_ih (B_N_PITEM_HEAD (Sh, 0)) &&
178 I_ENTRY_COUNT (B_N_PITEM_HEAD (Sh, 0)) == 1)) {
179 /* node contains more than 1 item, or item is not directory item, or this item contains more than 1 entry */
180 print_block (Sh, 0, -1, -1);
181 reiserfs_panic (tb->tb_sb, "vs-8045: create_virtual_node: rdkey %k, affected item==%d (mode==%c) Must be %c",
182 key, vn->vn_affected_item_num, vn->vn_mode, M_DELETE);
184 /* we can delete directory item, that has only one directory entry in it */
193 /* using virtual node check, how many items can be shifted to left
/*
 * Using the virtual node, decide how many whole items (tb->lnum[h]) and
 * how many bytes of a boundary item (tb->lbytes) can be shifted into
 * the left neighbor L[h], given cur_free bytes of space there.
 * NOTE(review): this extract elides interior lines; comments describe
 * only the visible code.
 */
195 static void check_left (struct tree_balance * tb, int h, int cur_free)
198 struct virtual_node * vn = tb->tb_vn;
199 struct virtual_item * vi;
202 RFALSE( cur_free < 0, "vs-8050: cur_free (%d) < 0", cur_free);
/* internal level: every shifted pointer costs DC_SIZE plus a delimiting key */
206 tb->lnum[h] = cur_free / (DC_SIZE + KEY_SIZE);
212 if (!cur_free || !vn->vn_nr_item) {
213 /* no free space or nothing to move */
219 RFALSE( !PATH_H_PPARENT (tb->tb_path, 0),
220 "vs-8055: parent does not exist or invalid");
/* a left-mergeable first item saves one item header on the move */
223 if ((unsigned int)cur_free >= (vn->vn_size - ((vi->vi_type & VI_TYPE_LEFT_MERGEABLE) ? IH_SIZE : 0))) {
224 /* all contents of S[0] fits into L[0] */
226 RFALSE( vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE,
227 "vs-8055: invalid mode or balance condition failed");
229 tb->lnum[0] = vn->vn_nr_item;
235 d_size = 0, ih_size = IH_SIZE;
237 /* first item may be merge with last item in left neighbor */
238 if (vi->vi_type & VI_TYPE_LEFT_MERGEABLE)
239 d_size = -((int)IH_SIZE), ih_size = 0;
/* walk items left to right, greedily packing whole items into L[0] */
242 for (i = 0; i < vn->vn_nr_item; i ++, ih_size = IH_SIZE, d_size = 0, vi ++) {
243 d_size += vi->vi_item_len;
244 if (cur_free >= d_size) {
245 /* the item can be shifted entirely */
251 /* the item cannot be shifted entirely, try to split it */
252 /* check whether L[0] can hold ih and at least one byte of the item body */
253 if (cur_free <= ih_size) {
254 /* cannot shift even a part of the current item */
260 tb->lbytes = op_check_left (vi, cur_free, 0, 0);
261 if (tb->lbytes != -1)
262 /* count partially shifted item */
272 /* using virtual node check, how many items can be shifted to right
/*
 * Mirror of check_left: decide how many whole items (tb->rnum[h]) and
 * how many bytes of a boundary item (tb->rbytes) can be shifted into
 * the right neighbor R[h], given cur_free bytes of space there.
 * NOTE(review): this extract elides interior lines; comments describe
 * only the visible code.
 */
274 static void check_right (struct tree_balance * tb, int h, int cur_free)
277 struct virtual_node * vn = tb->tb_vn;
278 struct virtual_item * vi;
281 RFALSE( cur_free < 0, "vs-8070: cur_free < 0");
/* internal level: each shifted pointer costs DC_SIZE plus a delimiting key */
285 tb->rnum[h] = cur_free / (DC_SIZE + KEY_SIZE);
291 if (!cur_free || !vn->vn_nr_item) {
298 RFALSE( !PATH_H_PPARENT (tb->tb_path, 0),
299 "vs-8075: parent does not exist or invalid");
/* start from the last virtual item; it is the one adjacent to R[0] */
301 vi = vn->vn_vi + vn->vn_nr_item - 1;
302 if ((unsigned int)cur_free >= (vn->vn_size - ((vi->vi_type & VI_TYPE_RIGHT_MERGEABLE) ? IH_SIZE : 0))) {
303 /* all contents of S[0] fits into R[0] */
305 RFALSE( vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE,
306 "vs-8080: invalid mode or balance condition failed");
308 tb->rnum[h] = vn->vn_nr_item;
313 d_size = 0, ih_size = IH_SIZE;
315 /* last item may be merge with first item in right neighbor */
316 if (vi->vi_type & VI_TYPE_RIGHT_MERGEABLE)
317 d_size = -(int)IH_SIZE, ih_size = 0;
/* walk items right to left, greedily packing whole items into R[0] */
320 for (i = vn->vn_nr_item - 1; i >= 0; i --, d_size = 0, ih_size = IH_SIZE, vi --) {
321 d_size += vi->vi_item_len;
322 if (cur_free >= d_size) {
323 /* the item can be shifted entirely */
329 /* check whether R[0] can hold ih and at least one byte of the item body */
330 if ( cur_free <= ih_size ) { /* cannot shift even a part of the current item */
335 /* R[0] can hold the header of the item and at least one byte of its body */
336 cur_free -= ih_size; /* cur_free is still > 0 */
338 tb->rbytes = op_check_right (vi, cur_free);
339 if (tb->rbytes != -1)
340 /* count partially shifted item */
351 * from - number of items, which are shifted to left neighbor entirely
352 * to - number of item, which are shifted to right neighbor entirely
353 * from_bytes - number of bytes of boundary item (or directory entries) which are shifted to left neighbor
354 * to_bytes - number of bytes of boundary item (or directory entries) which are shifted to right neighbor */
/*
 * Compute how many nodes are needed to hold the virtual items that are
 * NOT shifted to the neighbors (items [from .. vn_nr_item-to-1], minus
 * from_bytes/to_bytes of the boundary items), and fill snum012[] with
 * the per-node item counts (s0/s1/s2) and split byte counts (s1bytes,
 * s2bytes).  Returns the number of nodes needed.
 * NOTE(review): this extract elides interior lines (braces, some
 * variable declarations, continue/return statements); comments describe
 * only the visible code.
 */
355 static int get_num_ver (int mode, struct tree_balance * tb, int h,
356 int from, int from_bytes,
357 int to, int to_bytes,
358 short * snum012, int flow
365 struct virtual_node * vn = tb->tb_vn;
366 // struct virtual_item * vi;
368 int total_node_size, max_node_size, current_item_size;
370 int start_item, /* position of item we start filling node from */
371 end_item, /* position of item we finish filling node by */
372 start_bytes,/* number of first bytes (entries for directory) of start_item-th item
373 we do not include into node that is being filled */
374 end_bytes; /* number of last bytes (entries for directory) of end_item-th item
375 we do not include into node that is being filled */
376 int split_item_positions[2]; /* these are positions in virtual item of
377 items, that are split between S[0] and
378 S1new and S1new and S2new */
380 split_item_positions[0] = -1;
381 split_item_positions[1] = -1;
383 /* We only create additional nodes if we are in insert or paste mode
384 or we are in replace mode at the internal level. If h is 0 and
385 the mode is M_REPLACE then in fix_nodes we change the mode to
386 paste or insert before we get here in the code. */
387 RFALSE( tb->insert_size[h] < 0 || (mode != M_INSERT && mode != M_PASTE),
388 "vs-8100: insert_size < 0 in overflow");
390 max_node_size = MAX_CHILD_SIZE (PATH_H_PBUFFER (tb->tb_path, h));
392 /* snum012 [0-2] - number of items, that lay
393 to S[0], first new node and second new node */
394 snum012[3] = -1; /* s1bytes */
395 snum012[4] = -1; /* s2bytes */
/* internal levels: size is purely pointers + delimiting keys, no splitting */
399 i = ((to - from) * (KEY_SIZE + DC_SIZE) + DC_SIZE);
400 if (i == max_node_size)
402 return (i / max_node_size + 1);
408 cur_free = max_node_size;
410 // start from 'from'-th item
412 // skip its first 'start_bytes' units
413 start_bytes = ((from_bytes != -1) ? from_bytes : 0);
415 // last included item is the 'end_item'-th one
416 end_item = vn->vn_nr_item - to - 1;
417 // do not count last 'end_bytes' units of 'end_item'-th item
418 end_bytes = (to_bytes != -1) ? to_bytes : 0;
420 /* go through all item beginning from the start_item-th item and ending by
421 the end_item-th item. Do not count first 'start_bytes' units of
422 'start_item'-th item and last 'end_bytes' of 'end_item'-th item */
424 for (i = start_item; i <= end_item; i ++) {
425 struct virtual_item * vi = vn->vn_vi + i;
426 int skip_from_end = ((i == end_item) ? end_bytes : 0);
428 RFALSE( needed_nodes > 3, "vs-8105: too many nodes are needed");
430 /* get size of current item */
431 current_item_size = vi->vi_item_len;
433 /* do not take in calculation head part (from_bytes) of from-th item */
434 current_item_size -= op_part_size (vi, 0/*from start*/, start_bytes);
436 /* do not take in calculation tail part of last item */
437 current_item_size -= op_part_size (vi, 1/*from end*/, skip_from_end);
439 /* if item fits into current node entierly */
440 if (total_node_size + current_item_size <= max_node_size) {
441 snum012[needed_nodes - 1] ++;
442 total_node_size += current_item_size;
447 if (current_item_size > max_node_size) {
448 /* virtual item length is longer, than max size of item in
449 a node. It is impossible for direct item */
450 RFALSE( is_direct_le_ih (vi->vi_ih),
452 "direct item length is %d. It can not be longer than %d",
453 current_item_size, max_node_size);
454 /* we will try to split it */
459 /* as we do not split items, take new node and continue */
460 needed_nodes ++; i --; total_node_size = 0;
464 // calculate number of item units which fit into node being
/* IH_SIZE is reserved because a split item needs its own header here */
469 free_space = max_node_size - total_node_size - IH_SIZE;
470 units = op_check_left (vi, free_space, start_bytes, skip_from_end);
472 /* nothing fits into current node, take new node and continue */
473 needed_nodes ++, i--, total_node_size = 0;
478 /* something fits into the current node */
479 //if (snum012[3] != -1 || needed_nodes != 1)
480 // reiserfs_panic (tb->tb_sb, "vs-8115: get_num_ver: too many nodes required");
481 //snum012[needed_nodes - 1 + 3] = op_unit_num (vi) - start_bytes - units;
482 start_bytes += units;
483 snum012[needed_nodes - 1 + 3] = units;
485 if (needed_nodes > 2)
486 reiserfs_warning (tb->tb_sb, "vs-8111: get_num_ver: split_item_position is out of boundary\n");
487 snum012[needed_nodes - 1] ++;
488 split_item_positions[needed_nodes - 1] = i;
490 /* continue from the same item with start_bytes != -1 */
496 // sum012[4] (if it is not -1) contains number of units of which
497 // are to be in S1new, snum012[3] - to be in S0. They are supposed
498 // to be S1bytes and S2bytes correspondingly, so recalculate
499 if (snum012[4] > 0) {
501 int bytes_to_r, bytes_to_l;
504 split_item_num = split_item_positions[1];
505 bytes_to_l = ((from == split_item_num && from_bytes != -1) ? from_bytes : 0);
506 bytes_to_r = ((end_item == split_item_num && end_bytes != -1) ? end_bytes : 0);
507 bytes_to_S1new = ((split_item_positions[0] == split_item_positions[1]) ? snum012[3] : 0);
/* s2bytes = total units of the split item minus everything placed elsewhere */
510 snum012[4] = op_unit_num (&vn->vn_vi[split_item_num]) - snum012[4] - bytes_to_r - bytes_to_l - bytes_to_S1new;
512 if (vn->vn_vi[split_item_num].vi_index != TYPE_DIRENTRY)
513 reiserfs_warning (tb->tb_sb, "vs-8115: get_num_ver: not directory item\n");
516 /* now we know S2bytes, calculate S1bytes */
517 if (snum012[3] > 0) {
519 int bytes_to_r, bytes_to_l;
522 split_item_num = split_item_positions[0];
523 bytes_to_l = ((from == split_item_num && from_bytes != -1) ? from_bytes : 0);
524 bytes_to_r = ((end_item == split_item_num && end_bytes != -1) ? end_bytes : 0);
525 bytes_to_S2new = ((split_item_positions[0] == split_item_positions[1] && snum012[4] != -1) ? snum012[4] : 0);
/* s1bytes computed symmetrically to s2bytes above */
528 snum012[3] = op_unit_num (&vn->vn_vi[split_item_num]) - snum012[3] - bytes_to_r - bytes_to_l - bytes_to_S2new;
535 #ifdef CONFIG_REISERFS_CHECK
536 extern struct tree_balance * cur_tb;
540 /* Set parameters for balancing.
541 * Performs write of results of analysis of balancing into structure tb,
542 * where it will later be used by the functions that actually do the balancing.
544 * tb tree_balance structure;
545 * h current level of the node;
546 * lnum number of items from S[h] that must be shifted to L[h];
547 * rnum number of items from S[h] that must be shifted to R[h];
548 * blk_num number of blocks that S[h] will be splitted into;
549 * s012 number of items that fall into splitted nodes.
550 * lbytes number of bytes which flow to the left neighbor from the item that is not
551 * not shifted entirely
552 * rbytes number of bytes which flow to the right neighbor from the item that is not
553 * not shifted entirely
554 * s1bytes number of bytes which flow to the first new node when S[0] splits (this number is contained in s012 array)
/*
 * Record the result of the balancing analysis for level h into tb:
 * lnum/rnum (items to shift left/right), blknum (nodes S[h] splits
 * into), and — at the leaf level only — the s012 split counts and the
 * lbytes/rbytes partial-item byte counts.  See the comment block above
 * for the meaning of each parameter.
 * NOTE(review): this extract elides interior lines (assignments of
 * tb->lnum/rnum/lbytes/rbytes, braces); comments describe only the
 * visible code.
 */
557 static void set_parameters (struct tree_balance * tb, int h, int lnum,
558 int rnum, int blk_num, short * s012, int lb, int rb)
563 tb->blknum[h] = blk_num;
566 { /* only for leaf level */
/* s012 layout matches snum012[] from get_num_ver: s0num, s1num, s2num,
   s1bytes, s2bytes */
569 tb->s0num = * s012 ++,
570 tb->s1num = * s012 ++,
571 tb->s2num = * s012 ++;
572 tb->s1bytes = * s012 ++;
573 tb->s2bytes = * s012;
/* /proc statistics only; no effect on the balancing decision */
578 PROC_INFO_ADD( tb -> tb_sb, lnum[ h ], lnum );
579 PROC_INFO_ADD( tb -> tb_sb, rnum[ h ], rnum );
581 PROC_INFO_ADD( tb -> tb_sb, lbytes[ h ], lb );
582 PROC_INFO_ADD( tb -> tb_sb, rbytes[ h ], rb );
587 /* check, does node disappear if we shift tb->lnum[0] items to left
588 neighbor and tb->rnum[0] to the right one. */
/*
 * Check whether S[0] becomes empty if tb->lnum[0] items are shifted to
 * the left neighbor and tb->rnum[0] to the right one; if so, set the
 * balance parameters accordingly.  Return value semantics (removable or
 * not) — the elided returns are not visible in this extract.
 */
589 static int is_leaf_removable (struct tree_balance * tb)
591 struct virtual_node * vn = tb->tb_vn;
592 int to_left, to_right;
596 /* number of items, that will be shifted to left (right) neighbor
/* lbytes/rbytes != -1 means the boundary item is only partially shifted,
   so it is excluded from the whole-item count */
598 to_left = tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0);
599 to_right = tb->rnum[0] - ((tb->rbytes != -1) ? 1 : 0);
600 remain_items = vn->vn_nr_item;
602 /* how many items remain in S[0] after shiftings to neighbors */
603 remain_items -= (to_left + to_right);
605 if (remain_items < 1) {
606 /* all content of node can be shifted to neighbors */
607 set_parameters (tb, 0, to_left, vn->vn_nr_item - to_left, 0, NULL, -1, -1);
611 if (remain_items > 1 || tb->lbytes == -1 || tb->rbytes == -1)
612 /* S[0] is not removable */
615 /* check, whether we can divide 1 remaining item between neighbors */
617 /* get size of remaining item (in item units) */
618 size = op_unit_num (&(vn->vn_vi[to_left]));
620 if (tb->lbytes + tb->rbytes >= size) {
621 set_parameters (tb, 0, to_left + 1, to_right + 1, 0, NULL, tb->lbytes, -1);
629 /* check whether L, S, R can be joined in one node */
/*
 * Check whether L[0], S[0] and R[0] can all be joined into one node
 * (i.e. both leaves around S[0] can absorb its contents), given the
 * free space lfree/rfree of the neighbors.
 * NOTE(review): this extract elides interior lines (ih_size setup,
 * returns, braces); comments describe only the visible code.
 */
630 static int are_leaves_removable (struct tree_balance * tb, int lfree, int rfree)
632 struct virtual_node * vn = tb->tb_vn;
634 struct buffer_head *S0;
636 S0 = PATH_H_PBUFFER (tb->tb_path, 0);
639 if (vn->vn_nr_item) {
/* mergeable boundary items save an item header each when joining */
640 if (vn->vn_vi[0].vi_type & VI_TYPE_LEFT_MERGEABLE)
643 if (vn->vn_vi[vn->vn_nr_item-1].vi_type & VI_TYPE_RIGHT_MERGEABLE)
646 /* there was only one item and it will be deleted */
647 struct item_head * ih;
649 RFALSE( B_NR_ITEMS (S0) != 1,
650 "vs-8125: item number must be 1: it is %d", B_NR_ITEMS(S0));
652 ih = B_N_PITEM_HEAD (S0, 0);
653 if (tb->CFR[0] && !comp_short_le_keys (&(ih->ih_key), B_N_PDELIM_KEY (tb->CFR[0], tb->rkey[0])))
654 if (is_direntry_le_ih (ih)) {
655 /* Directory must be in correct state here: that is
656 somewhere at the left side should exist first directory
657 item. But the item being deleted can not be that first
658 one because its right neighbor is item of the same
659 directory. (But first item always gets deleted in last
660 turn). So, neighbors of deleted item can be merged, so
661 we can save ih_size */
664 /* we might check that left neighbor exists and is of the
666 RFALSE(le_ih_k_offset (ih) == DOT_OFFSET,
667 "vs-8130: first directory item can not be removed until directory is not empty");
/* join is possible when everything fits in neighbors' free space */
672 if (MAX_CHILD_SIZE (S0) + vn->vn_size <= rfree + lfree + ih_size) {
673 set_parameters (tb, 0, -1, -1, -1, NULL, -1, -1);
674 PROC_INFO_INC( tb -> tb_sb, leaves_removable );
683 /* when we do not split item, lnum and rnum are numbers of entire items */
/*
 * Helper macro for the check-balance routines: set the balance
 * parameters for a shift to the left neighbor, either averaging items
 * between the nodes (internal level) or honoring the computed lpar /
 * lbytes (leaf level, LEFT_SHIFT_FLOW vs. non-flow case).
 * NOTE(review): the macro continues past this extract (trailing
 * backslash on the last visible line); no comments are inserted inside
 * the continuation chain to keep line splicing intact.
 */
684 #define SET_PAR_SHIFT_LEFT \
689 to_l = (MAX_NR_KEY(Sh)+1 - lpar + vn->vn_nr_item + 1) / 2 -\
690 (MAX_NR_KEY(Sh) + 1 - lpar);\
692 set_parameters (tb, h, to_l, 0, lnver, NULL, -1, -1);\
696 if (lset==LEFT_SHIFT_FLOW)\
697 set_parameters (tb, h, lpar, 0, lnver, snum012+lset,\
700 set_parameters (tb, h, lpar - (tb->lbytes!=-1), 0, lnver, snum012+lset,\
/*
 * Mirror of SET_PAR_SHIFT_LEFT for shifts to the right neighbor: either
 * average items between the nodes (internal level) or honor the
 * computed rpar / rbytes (leaf level, RIGHT_SHIFT_FLOW vs. non-flow).
 * NOTE(review): the macro continues past this extract; no comments are
 * inserted inside the backslash continuation chain.
 */
705 #define SET_PAR_SHIFT_RIGHT \
710 to_r = (MAX_NR_KEY(Sh)+1 - rpar + vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 - rpar);\
712 set_parameters (tb, h, 0, to_r, rnver, NULL, -1, -1);\
716 if (rset==RIGHT_SHIFT_FLOW)\
717 set_parameters (tb, h, 0, rpar, rnver, snum012+rset,\
720 set_parameters (tb, h, 0, rpar - (tb->rbytes!=-1), rnver, snum012+rset,\
/*
 * Release every buffer reference held by the tree_balance structure:
 * the search path itself, then the per-level neighbor buffers
 * (L/R, their parents FL/FR, and common parents CFL/CFR), nulling each
 * pointer after the drop so they cannot be released twice.
 */
725 void free_buffers_in_tb (
726 struct tree_balance * p_s_tb
730 decrement_counters_in_path(p_s_tb->tb_path);
732 for ( n_counter = 0; n_counter < MAX_HEIGHT; n_counter++ ) {
733 decrement_bcount(p_s_tb->L[n_counter]);
734 p_s_tb->L[n_counter] = NULL;
735 decrement_bcount(p_s_tb->R[n_counter]);
736 p_s_tb->R[n_counter] = NULL;
737 decrement_bcount(p_s_tb->FL[n_counter]);
738 p_s_tb->FL[n_counter] = NULL;
739 decrement_bcount(p_s_tb->FR[n_counter]);
740 p_s_tb->FR[n_counter] = NULL;
741 decrement_bcount(p_s_tb->CFL[n_counter]);
742 p_s_tb->CFL[n_counter] = NULL;
743 decrement_bcount(p_s_tb->CFR[n_counter]);
744 p_s_tb->CFR[n_counter] = NULL;
749 /* Get new buffers for storing new nodes that are created while balancing.
750 * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked;
751 * CARRY_ON - schedule didn't occur while the function worked;
752 * NO_DISK_SPACE - no disk space.
754 /* The function is NOT SCHEDULE-SAFE! */
/*
 * Allocate the empty blocks needed at level n_h of the balancing and
 * attach their buffers to p_s_tb->FEB[].  Returns CARRY_ON,
 * REPEAT_SEARCH (schedule / buffer contention detected) or
 * NO_DISK_SPACE, per the comment block above.  NOT schedule-safe.
 * NOTE(review): this extract elides interior lines (declarations of
 * n_counter/n_retval, braces, final return); comments describe only the
 * visible code.
 */
755 static int get_empty_nodes(
756 struct tree_balance * p_s_tb,
759 struct buffer_head * p_s_new_bh,
760 * p_s_Sh = PATH_H_PBUFFER (p_s_tb->tb_path, n_h);
761 unsigned long * p_n_blocknr,
762 a_n_blocknrs[MAX_AMOUNT_NEEDED] = {0, };
765 n_amount_needed,/* number of needed empty blocks */
767 struct super_block * p_s_sb = p_s_tb->tb_sb;
770 /* number_of_freeblk is the number of empty blocks which have been
771 acquired for use by the balancing algorithm minus the number of
772 empty blocks used in the previous levels of the analysis,
773 number_of_freeblk = tb->cur_blknum can be non-zero if a schedule occurs
774 after empty blocks are acquired, and the balancing analysis is
775 then restarted, amount_needed is the number needed by this level
776 (n_h) of the balancing analysis.
778 Note that for systems with many processes writing, it would be
779 more layout optimal to calculate the total number needed by all
780 levels and then to run reiserfs_new_blocks to get all of them at once. */
782 /* Initiate number_of_freeblk to the amount acquired prior to the restart of
783 the analysis or 0 if not restarted, then subtract the amount needed
784 by all of the levels of the tree below n_h. */
785 /* blknum includes S[n_h], so we subtract 1 in this calculation */
786 for ( n_counter = 0, n_number_of_freeblk = p_s_tb->cur_blknum; n_counter < n_h; n_counter++ )
787 n_number_of_freeblk -= ( p_s_tb->blknum[n_counter] ) ? (p_s_tb->blknum[n_counter] - 1) : 0;
789 /* Allocate missing empty blocks. */
790 /* if p_s_Sh == 0 then we are getting a new root */
791 n_amount_needed = ( p_s_Sh ) ? (p_s_tb->blknum[n_h] - 1) : 1;
792 /* Amount_needed = the amount that we need more than the amount that we have. */
793 if ( n_amount_needed > n_number_of_freeblk )
794 n_amount_needed -= n_number_of_freeblk;
795 else /* If we have enough already then there is nothing to do. */
798 if ( reiserfs_new_form_blocknrs (p_s_tb, a_n_blocknrs,
799 n_amount_needed) == NO_DISK_SPACE )
800 return NO_DISK_SPACE;
802 /* for each blocknumber we just got, get a buffer and stick it on FEB */
803 for ( p_n_blocknr = a_n_blocknrs, n_counter = 0; n_counter < n_amount_needed;
804 p_n_blocknr++, n_counter++ ) {
806 RFALSE( ! *p_n_blocknr,
807 "PAP-8135: reiserfs_new_blocknrs failed when got new blocks");
809 p_s_new_bh = getblk(p_s_sb->s_dev, *p_n_blocknr, p_s_sb->s_blocksize);
/* b_count > 1 means someone else still holds this supposedly-free block */
810 if (atomic_read (&(p_s_new_bh->b_count)) > 1) {
811 /*&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&*/
813 reiserfs_warning (p_s_sb, "waiting for buffer %b, iput inode pid = %d, this pid %d, mode %c, %h\n",
814 p_s_new_bh, put_inode_pid, current->pid, p_s_tb->tb_vn->vn_mode, p_s_tb->tb_vn->vn_ins_ih);
815 print_tb (0, 0, 0, p_s_tb, "tb");
817 /*&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&*/
/* a journaled/journal-dirty buffer with count == 2 is tolerable;
   otherwise release everything, wait, and restart the search */
818 if (atomic_read(&(p_s_new_bh->b_count)) > 2 ||
819 !(buffer_journaled(p_s_new_bh) || buffer_journal_dirty(p_s_new_bh))) {
820 n_retval = REPEAT_SEARCH ;
821 free_buffers_in_tb (p_s_tb);
822 wait_buffer_until_released (p_s_new_bh);
825 RFALSE( (atomic_read (&(p_s_new_bh->b_count)) != 1 ||
826 buffer_dirty (p_s_new_bh)) &&
827 (atomic_read(&(p_s_new_bh->b_count)) > 2 ||
828 !(buffer_journaled(p_s_new_bh) ||
829 buffer_journal_dirty(p_s_new_bh))),
830 "PAP-8140: not free or dirty buffer %b for the new block",
833 /* Put empty buffers into the array. */
834 if (p_s_tb->FEB[p_s_tb->cur_blknum])
837 mark_buffer_journal_new(p_s_new_bh) ;
838 p_s_tb->FEB[p_s_tb->cur_blknum++] = p_s_new_bh;
/* the filesystem may have changed while we blocked in getblk/allocation */
841 if ( n_retval == CARRY_ON && FILESYSTEM_CHANGED_TB (p_s_tb) )
842 n_retval = REPEAT_SEARCH ;
848 /* Get free space of the left neighbor, which is stored in the parent
849 * node of the left neighbor. */
/*
 * Return the free space of the left neighbor L[h], as recorded in the
 * disk-child entry of its parent FL[h].  Returns 0 (presumably — the
 * early-return body is elided in this extract) when either the parent
 * of S[h] or FL[h] is absent.
 */
850 static int get_lfree (struct tree_balance * tb, int h)
852 struct buffer_head * l, * f;
855 if ((f = PATH_H_PPARENT (tb->tb_path, h)) == 0 || (l = tb->FL[h]) == 0)
/* same parent: the left neighbor is the previous child slot */
859 order = PATH_H_B_ITEM_ORDER (tb->tb_path, h) - 1;
/* different parents: left neighbor is the last child of FL[h] */
861 order = B_NR_ITEMS (l);
865 return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f,order)));
869 /* Get free space of the right neighbor,
870 * which is stored in the parent node of the right neighbor.
/*
 * Mirror of get_lfree: return the free space of the right neighbor
 * R[h], as recorded in the disk-child entry of its parent FR[h].
 * The early-return and the different-parent branch (order = 0,
 * presumably) are elided in this extract.
 */
872 static int get_rfree (struct tree_balance * tb, int h)
874 struct buffer_head * r, * f;
877 if ((f = PATH_H_PPARENT (tb->tb_path, h)) == 0 || (r = tb->FR[h]) == 0)
/* same parent: the right neighbor is the next child slot */
881 order = PATH_H_B_ITEM_ORDER (tb->tb_path, h) + 1;
887 return (MAX_CHILD_SIZE(f) - dc_size( B_N_CHILD(f,order)));
892 /* Check whether left neighbor is in memory. */
/*
 * Check whether the left neighbor of S[n_h] is already present in the
 * buffer cache (without reading it from disk).  Uses FL[n_h] to locate
 * the neighbor's block number, then probes the hash table.
 * NOTE(review): this extract elides interior lines (returns, brelse of
 * the looked-up buffer); comments describe only the visible code.
 */
893 static int is_left_neighbor_in_cache(
894 struct tree_balance * p_s_tb,
897 struct buffer_head * p_s_father, * left;
898 struct super_block * p_s_sb = p_s_tb->tb_sb;
899 unsigned long n_left_neighbor_blocknr;
900 int n_left_neighbor_position;
902 if ( ! p_s_tb->FL[n_h] ) /* Father of the left neighbor does not exist. */
905 /* Calculate father of the node to be balanced. */
906 p_s_father = PATH_H_PBUFFER(p_s_tb->tb_path, n_h + 1);
908 RFALSE( ! p_s_father ||
909 ! B_IS_IN_TREE (p_s_father) ||
910 ! B_IS_IN_TREE (p_s_tb->FL[n_h]) ||
911 ! buffer_uptodate (p_s_father) ||
912 ! buffer_uptodate (p_s_tb->FL[n_h]),
913 "vs-8165: F[h] (%b) or FL[h] (%b) is invalid",
914 p_s_father, p_s_tb->FL[n_h]);
917 /* Get position of the pointer to the left neighbor into the left father. */
/* same node: use the stored lkey position; different: last child of FL */
918 n_left_neighbor_position = ( p_s_father == p_s_tb->FL[n_h] ) ?
919 p_s_tb->lkey[n_h] : B_NR_ITEMS (p_s_tb->FL[n_h]);
920 /* Get left neighbor block number. */
921 n_left_neighbor_blocknr = B_N_CHILD_NUM(p_s_tb->FL[n_h], n_left_neighbor_position);
922 /* Look for the left neighbor in the cache. */
923 if ( (left = sb_get_hash_table(p_s_sb, n_left_neighbor_blocknr)) ) {
925 RFALSE( buffer_uptodate (left) && ! B_IS_IN_TREE(left),
926 "vs-8170: left neighbor (%b %z) is not in the tree", left, left);
/* Direction selectors passed to get_far_parent as c_lr_par. */
935 #define LEFT_PARENTS 'l'
936 #define RIGHT_PARENTS 'r'
/*
 * Decrement a cpu-format key in place, dispatching to the
 * item-type-specific decrement_key operation for the key's type.
 */
939 static void decrement_key (struct cpu_key * p_s_key)
941 // call item specific function for this key
942 item_ops[cpu_key_k_type (p_s_key)]->decrement_key (p_s_key);
948 /* Calculate far left/right parent of the left/right neighbor of the current node, that
949 * is calculate the left/right (FL[h]/FR[h]) neighbor of the parent F[h].
950 * Calculate left/right common parent of the current node and L[h]/R[h].
951 * Calculate left/right delimiting key position.
952 * Returns: PATH_INCORRECT - path in the tree is not correct;
953 SCHEDULE_OCCURRED - schedule occurred while the function worked;
954 * CARRY_ON - schedule didn't occur while the function worked;
/*
 * Find the far (left or right, per c_lr_par) parent of the neighbor of
 * the node at level n_h: walk up the path to the common ancestor of
 * F[n_h] and the neighbor's parent, then search back down by key for
 * the neighbor's parent itself.  On success *pp_s_father and
 * *pp_s_com_father are set (with references taken); returns CARRY_ON,
 * REPEAT_SEARCH, or IO_ERROR (per the comment block above).
 * NOTE(review): this extract elides interior lines (returns, braces,
 * some declarations); comments describe only the visible code.
 */
956 static int get_far_parent (struct tree_balance * p_s_tb,
958 struct buffer_head ** pp_s_father,
959 struct buffer_head ** pp_s_com_father,
962 struct buffer_head * p_s_parent;
963 INITIALIZE_PATH (s_path_to_neighbor_father);
964 struct path * p_s_path = p_s_tb->tb_path;
965 struct cpu_key s_lr_father_key;
967 n_position = INT_MAX,
968 n_first_last_position = 0,
969 n_path_offset = PATH_H_PATH_OFFSET(p_s_path, n_h);
971 /* Starting from F[n_h] go upwards in the tree, and look for the common
972 ancestor of F[n_h], and its neighbor l/r, that should be obtained. */
974 n_counter = n_path_offset;
976 RFALSE( n_counter < FIRST_PATH_ELEMENT_OFFSET,
977 "PAP-8180: invalid path length");
980 for ( ; n_counter > FIRST_PATH_ELEMENT_OFFSET; n_counter-- ) {
981 /* Check whether parent of the current buffer in the path is really parent in the tree. */
982 if ( ! B_IS_IN_TREE(p_s_parent = PATH_OFFSET_PBUFFER(p_s_path, n_counter - 1)) )
983 return REPEAT_SEARCH;
984 /* Check whether position in the parent is correct. */
985 if ( (n_position = PATH_OFFSET_POSITION(p_s_path, n_counter - 1)) > B_NR_ITEMS(p_s_parent) )
986 return REPEAT_SEARCH;
987 /* Check whether parent at the path really points to the child. */
988 if ( B_N_CHILD_NUM(p_s_parent, n_position) !=
989 PATH_OFFSET_PBUFFER(p_s_path, n_counter)->b_blocknr )
990 return REPEAT_SEARCH;
991 /* Return delimiting key if position in the parent is not equal to first/last one. */
/* for LEFT_PARENTS the boundary is position 0 (n_first_last_position
   stays 0); for RIGHT_PARENTS it is the last child */
992 if ( c_lr_par == RIGHT_PARENTS )
993 n_first_last_position = B_NR_ITEMS (p_s_parent);
994 if ( n_position != n_first_last_position ) {
995 *pp_s_com_father = p_s_parent;
996 get_bh(*pp_s_com_father) ;
997 /*(*pp_s_com_father = p_s_parent)->b_count++;*/
1002 /* if we are in the root of the tree, then there is no common father */
1003 if ( n_counter == FIRST_PATH_ELEMENT_OFFSET ) {
1004 /* Check whether first buffer in the path is the root of the tree. */
1005 if ( PATH_OFFSET_PBUFFER(p_s_tb->tb_path, FIRST_PATH_ELEMENT_OFFSET)->b_blocknr ==
1006 SB_ROOT_BLOCK (p_s_tb->tb_sb) ) {
1007 *pp_s_father = *pp_s_com_father = NULL;
1010 return REPEAT_SEARCH;
1013 RFALSE( B_LEVEL (*pp_s_com_father) <= DISK_LEAF_NODE_LEVEL,
1014 "PAP-8185: (%b %z) level too small",
1015 *pp_s_com_father, *pp_s_com_father);
1017 /* Check whether the common parent is locked. */
1019 if ( buffer_locked (*pp_s_com_father) ) {
1020 __wait_on_buffer(*pp_s_com_father);
/* sleeping in wait_on_buffer may have invalidated our analysis */
1021 if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) {
1022 decrement_bcount(*pp_s_com_father);
1023 return REPEAT_SEARCH;
1027 /* So, we got common parent of the current node and its left/right neighbor.
1028 Now we are geting the parent of the left/right neighbor. */
1030 /* Form key to get parent of the left/right neighbor. */
/* as a side effect this records lkey/rkey[n_h - 1] for later use */
1031 le_key2cpu_key (&s_lr_father_key, B_N_PDELIM_KEY(*pp_s_com_father, ( c_lr_par == LEFT_PARENTS ) ?
1032 (p_s_tb->lkey[n_h - 1] = n_position - 1) : (p_s_tb->rkey[n_h - 1] = n_position)));
1035 if ( c_lr_par == LEFT_PARENTS )
1036 decrement_key(&s_lr_father_key);
1038 if (search_by_key(p_s_tb->tb_sb, &s_lr_father_key, &s_path_to_neighbor_father, n_h + 1) == IO_ERROR)
1042 if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) {
1043 decrement_counters_in_path(&s_path_to_neighbor_father);
1044 decrement_bcount(*pp_s_com_father);
1045 return REPEAT_SEARCH;
1048 *pp_s_father = PATH_PLAST_BUFFER(&s_path_to_neighbor_father);
1050 RFALSE( B_LEVEL (*pp_s_father) != n_h + 1,
1051 "PAP-8190: (%b %z) level too small", *pp_s_father, *pp_s_father);
1052 RFALSE( s_path_to_neighbor_father.path_length < FIRST_PATH_ELEMENT_OFFSET,
1053 "PAP-8192: path length is too small");
/* drop the rest of the temporary path but keep the reference to the
   last buffer, which we return via *pp_s_father */
1055 s_path_to_neighbor_father.path_length--;
1056 decrement_counters_in_path(&s_path_to_neighbor_father);
1061 /* Get parents of neighbors of node in the path(S[n_path_offset]) and common parents of
1062 * S[n_path_offset] and L[n_path_offset]/R[n_path_offset]: F[n_path_offset], FL[n_path_offset],
1063 * FR[n_path_offset], CFL[n_path_offset], CFR[n_path_offset].
1064 * Calculate numbers of left and right delimiting keys position: lkey[n_path_offset], rkey[n_path_offset].
1065 * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked;
1066 * CARRY_ON - schedule didn't occur while the function worked;
/*
 * Collect the parents of S[n_h]'s neighbors and the common parents:
 * FL/CFL (left side) and FR/CFR (right side), plus the delimiting key
 * positions lkey[n_h]/rkey[n_h].  When the neighbor shares S[n_h]'s
 * parent the parent buffer serves as both F and CF; otherwise
 * get_far_parent() walks up to find the true common ancestor.
 * NOTE(review): this listing is elided — some original lines (braces,
 * error returns, final CARRY_ON) are not shown here.
 */
1068 static int get_parents (struct tree_balance * p_s_tb, int n_h)
1070 struct path * p_s_path = p_s_tb->tb_path;
1073 n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h);
1074 struct buffer_head * p_s_curf,
1077 /* Current node is the root of the tree or will be root of the tree */
1078 if ( n_path_offset <= FIRST_PATH_ELEMENT_OFFSET ) {
1079 /* The root can not have parents.
1080 Release nodes which previously were obtained as parents of the current node neighbors. */
1081 decrement_bcount(p_s_tb->FL[n_h]);
1082 decrement_bcount(p_s_tb->CFL[n_h]);
1083 decrement_bcount(p_s_tb->FR[n_h]);
1084 decrement_bcount(p_s_tb->CFR[n_h]);
1085 p_s_tb->FL[n_h] = p_s_tb->CFL[n_h] = p_s_tb->FR[n_h] = p_s_tb->CFR[n_h] = NULL;
1089 /* Get parent FL[n_path_offset] of L[n_path_offset]. */
1090 if ( (n_position = PATH_OFFSET_POSITION(p_s_path, n_path_offset - 1)) ) {
1091 /* Current node is not the first child of its parent. */
1092 /*(p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1))->b_count += 2;*/
1093 p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1);
1096 p_s_tb->lkey[n_h] = n_position - 1;
1099 /* Calculate current parent of L[n_path_offset], which is the left neighbor of the current node.
1100 Calculate current common parent of L[n_path_offset] and the current node. Note that
1101 CFL[n_path_offset] not equal FL[n_path_offset] and CFL[n_path_offset] not equal F[n_path_offset].
1102 Calculate lkey[n_path_offset]. */
1103 if ( (n_ret_value = get_far_parent(p_s_tb, n_h + 1, &p_s_curf,
1104 &p_s_curcf, LEFT_PARENTS)) != CARRY_ON )
1108 decrement_bcount(p_s_tb->FL[n_h]);
1109 p_s_tb->FL[n_h] = p_s_curf; /* New initialization of FL[n_h]. */
1110 decrement_bcount(p_s_tb->CFL[n_h]);
1111 p_s_tb->CFL[n_h] = p_s_curcf; /* New initialization of CFL[n_h]. */
1113 RFALSE( (p_s_curf && !B_IS_IN_TREE (p_s_curf)) ||
1114 (p_s_curcf && !B_IS_IN_TREE (p_s_curcf)),
1115 "PAP-8195: FL (%b) or CFL (%b) is invalid", p_s_curf, p_s_curcf);
1117 /* Get parent FR[n_h] of R[n_h]. */
1119 /* Current node is the last child of F[n_h]. FR[n_h] != F[n_h]. */
1120 if ( n_position == B_NR_ITEMS (PATH_H_PBUFFER(p_s_path, n_h + 1)) ) {
1121 /* Calculate current parent of R[n_h], which is the right neighbor of F[n_h].
1122 Calculate current common parent of R[n_h] and current node. Note that CFR[n_h]
1123 not equal FR[n_path_offset] and CFR[n_h] not equal F[n_h]. */
1124 if ( (n_ret_value = get_far_parent(p_s_tb, n_h + 1, &p_s_curf, &p_s_curcf, RIGHT_PARENTS)) != CARRY_ON )
1128 /* Current node is not the last child of its parent F[n_h]. */
1129 /*(p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1))->b_count += 2;*/
1130 p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1);
1133 p_s_tb->rkey[n_h] = n_position;
1136 decrement_bcount(p_s_tb->FR[n_h]);
1137 p_s_tb->FR[n_h] = p_s_curf; /* New initialization of FR[n_path_offset]. */
1139 decrement_bcount(p_s_tb->CFR[n_h]);
1140 p_s_tb->CFR[n_h] = p_s_curcf; /* New initialization of CFR[n_path_offset]. */
1142 RFALSE( (p_s_curf && !B_IS_IN_TREE (p_s_curf)) ||
1143 (p_s_curcf && !B_IS_IN_TREE (p_s_curcf)),
1144 "PAP-8205: FR (%b) or CFR (%b) is invalid", p_s_curf, p_s_curcf);
1150 /* it is possible to remove node as result of shiftings to
1151 neighbors even when we insert or paste item. */
/*
 * Decide whether S[h] could be emptied entirely into its neighbors,
 * even for an insert/paste (insert_size[h] > 0).  Returns
 * NO_BALANCING_NEEDED when the change fits in S[h] as-is (parameters
 * are then set for "no shift"); otherwise !NO_BALANCING_NEEDED so the
 * caller continues the full balance analysis.
 * NOTE(review): elided listing — the guard around r_key initialization
 * and the removability branch are only partially visible here.
 */
1152 static inline int can_node_be_removed (int mode, int lfree, int sfree, int rfree, struct tree_balance * tb, int h)
1154 struct buffer_head * Sh = PATH_H_PBUFFER (tb->tb_path, h);
1155 int levbytes = tb->insert_size[h];
1156 struct item_head * ih;
1157 struct key * r_key = NULL;
1159 ih = B_N_PITEM_HEAD (Sh, 0);
1161 r_key = B_N_PDELIM_KEY(tb->CFR[h],tb->rkey[h]);
1164 lfree + rfree + sfree < MAX_CHILD_SIZE(Sh) + levbytes
1165 /* shifting may merge items which might save space */
1166 - (( ! h && op_is_left_mergeable (&(ih->ih_key), Sh->b_size) ) ? IH_SIZE : 0)
1167 - (( ! h && r_key && op_is_left_mergeable (r_key, Sh->b_size) ) ? IH_SIZE : 0)
1168 + (( h ) ? KEY_SIZE : 0))
1170 /* node can not be removed */
1171 if (sfree >= levbytes ) { /* new item fits into node S[h] without any shifting */
1173 tb->s0num = B_NR_ITEMS(Sh) + ((mode == M_INSERT ) ? 1 : 0);
1174 set_parameters (tb, h, 0, 0, 1, NULL, -1, -1);
1175 return NO_BALANCING_NEEDED;
1178 PROC_INFO_INC( tb -> tb_sb, can_node_be_removed[ h ] );
1179 return !NO_BALANCING_NEEDED;
1184 /* Check whether current node S[h] is balanced when increasing its size by
1185 * Inserting or Pasting.
1186 * Calculate parameters for balancing for current level h.
1188 * tb tree_balance structure;
1189 * h current level of the node;
1190 * inum item number in S[h];
1191 * mode i - insert, p - paste;
1192 * Returns: 1 - schedule occurred;
1193 * 0 - balancing for higher levels needed;
1194 * -1 - no balancing for higher levels needed;
1195 * -2 - no disk space.
1197 /* ip means Inserting or Pasting */
/*
 * Compute balancing parameters for level h when S[h] is GROWING
 * (insert or paste).  Tries, in order: create a new root if needed,
 * empty S[h] into neighbors, fit the change in place, and otherwise
 * picks the cheapest of eight get_num_ver() packings (no shift /
 * left / right / both, each with and without item splitting).
 * NOTE(review): elided listing — switch cases, some assignments to
 * lpar/rpar and the SET_PAR_SHIFT_* branches are only partially
 * visible; confirm against the full source before editing logic.
 */
1198 static int ip_check_balance (struct tree_balance * tb, int h)
1200 struct virtual_node * vn = tb->tb_vn;
1201 int levbytes, /* Number of bytes that must be inserted into (value
1202 is negative if bytes are deleted) buffer which
1203 contains node being balanced. The mnemonic is
1204 that the attempted change in node space used level
1205 is levbytes bytes. */
1208 int lfree, sfree, rfree /* free space in L, S and R */;
1210 /* nver is short for number of vertixes, and lnver is the number if
1211 we shift to the left, rnver is the number if we shift to the
1212 right, and lrnver is the number if we shift in both directions.
1213 The goal is to minimize first the number of vertixes, and second,
1214 the number of vertixes whose contents are changed by shifting,
1215 and third the number of uncached vertixes whose contents are
1216 changed by shifting and must be read from disk. */
1217 int nver, lnver, rnver, lrnver;
1219 /* used at leaf level only, S0 = S[0] is the node being balanced,
1220 sInum [ I = 0,1,2 ] is the number of items that will
1221 remain in node SI after balancing. S1 and S2 are new
1222 nodes that might be created. */
1224 /* we perform 8 calls to get_num_ver(). For each call we calculate five parameters.
1225 where 4th parameter is s1bytes and 5th - s2bytes
1227 short snum012[40] = {0,}; /* s0num, s1num, s2num for 8 cases
1228 0,1 - do not shift and do not shift but bottle
1229 2 - shift only whole item to left
1230 3 - shift to left and bottle as much as possible
1231 4,5 - shift to right (whole items and as much as possible)
1232 6,7 - shift to both directions (whole items and as much as possible)
1235 /* Sh is the node whose balance is currently being checked */
1236 struct buffer_head * Sh;
1238 Sh = PATH_H_PBUFFER (tb->tb_path, h);
1239 levbytes = tb->insert_size[h];
1241 /* Calculate balance parameters for creating new root. */
1244 reiserfs_panic (tb->tb_sb, "vs-8210: ip_check_balance: S[0] can not be 0");
1245 switch ( n_ret_value = get_empty_nodes (tb, h) ) {
1247 set_parameters (tb, h, 0, 0, 1, NULL, -1, -1);
1248 return NO_BALANCING_NEEDED; /* no balancing for higher levels needed */
1254 reiserfs_panic(tb->tb_sb, "vs-8215: ip_check_balance: incorrect return value of get_empty_nodes");
1258 if ( (n_ret_value = get_parents (tb, h)) != CARRY_ON ) /* get parents of S[h] neighbors. */
1261 sfree = B_FREE_SPACE (Sh);
1263 /* get free space of neighbors */
1264 rfree = get_rfree (tb, h);
1265 lfree = get_lfree (tb, h);
1267 if (can_node_be_removed (vn->vn_mode, lfree, sfree, rfree, tb, h) == NO_BALANCING_NEEDED)
1268 /* and new item fits into node S[h] without any shifting */
1269 return NO_BALANCING_NEEDED;
1271 create_virtual_node (tb, h);
1274 determine maximal number of items we can shift to the left neighbor (in tb structure)
1275 and the maximal number of bytes that can flow to the left neighbor
1276 from the left most liquid item that cannot be shifted from S[0] entirely (returned value)
1278 check_left (tb, h, lfree);
1281 determine maximal number of items we can shift to the right neighbor (in tb structure)
1282 and the maximal number of bytes that can flow to the right neighbor
1283 from the right most liquid item that cannot be shifted from S[0] entirely (returned value)
1285 check_right (tb, h, rfree);
1288 /* all contents of internal node S[h] can be moved into its
1289 neighbors, S[h] will be removed after balancing */
1290 if (h && (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1)) {
1293 /* Since we are working on internal nodes, and our internal
1294 nodes have fixed size entries, then we can balance by the
1295 number of items rather than the space they consume. In this
1296 routine we set the left node equal to the right node,
1297 allowing a difference of less than or equal to 1 child
1299 to_r = ((MAX_NR_KEY(Sh)<<1)+2-tb->lnum[h]-tb->rnum[h]+vn->vn_nr_item+1)/2 -
1300 (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]);
1301 set_parameters (tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL, -1, -1);
1305 /* this checks balance condition, that any two neighboring nodes can not fit in one node */
1307 ( tb->lnum[h] >= vn->vn_nr_item + 1 ||
1308 tb->rnum[h] >= vn->vn_nr_item + 1),
1309 "vs-8220: tree is not balanced on internal level");
1310 RFALSE( ! h && ((tb->lnum[h] >= vn->vn_nr_item && (tb->lbytes == -1)) ||
1311 (tb->rnum[h] >= vn->vn_nr_item && (tb->rbytes == -1)) ),
1312 "vs-8225: tree is not balanced on leaf level");
1314 /* all contents of S[0] can be moved into its neighbors
1315 S[0] will be removed after balancing. */
1316 if (!h && is_leaf_removable (tb))
1320 /* why do we perform this check here rather than earlier??
1321 Answer: we can win 1 node in some cases above. Moreover we
1322 checked it above, when we checked, that S[0] is not removable
1324 if (sfree >= levbytes) { /* new item fits into node S[h] without any shifting */
1326 tb->s0num = vn->vn_nr_item;
1327 set_parameters (tb, h, 0, 0, 1, NULL, -1, -1);
1328 return NO_BALANCING_NEEDED;
1333 int lpar, rpar, nset, lset, rset, lrset;
1335 * regular overflowing of the node
1338 /* get_num_ver works in 2 modes (FLOW & NO_FLOW)
1339 lpar, rpar - number of items we can shift to left/right neighbor (including splitting item)
1340 nset, lset, rset, lrset - shows, whether flowing items give better packing
1343 #define NO_FLOW 0 /* do no splitting */
1345 /* we choose one of the following */
1346 #define NOTHING_SHIFT_NO_FLOW 0
1347 #define NOTHING_SHIFT_FLOW 5
1348 #define LEFT_SHIFT_NO_FLOW 10
1349 #define LEFT_SHIFT_FLOW 15
1350 #define RIGHT_SHIFT_NO_FLOW 20
1351 #define RIGHT_SHIFT_FLOW 25
1352 #define LR_SHIFT_NO_FLOW 30
1353 #define LR_SHIFT_FLOW 35
1360 /* calculate number of blocks S[h] must be split into when
1361 nothing is shifted to the neighbors,
1362 as well as number of items in each part of the split node (s012 numbers),
1363 and number of bytes (s1bytes) of the shared drop which flow to S1 if any */
1364 nset = NOTHING_SHIFT_NO_FLOW;
1365 nver = get_num_ver (vn->vn_mode, tb, h,
1366 0, -1, h?vn->vn_nr_item:0, -1,
1373 /* note, that in this case we try to bottle between S[0] and S1 (S1 - the first new node) */
1374 nver1 = get_num_ver (vn->vn_mode, tb, h,
1376 snum012 + NOTHING_SHIFT_FLOW, FLOW);
1378 nset = NOTHING_SHIFT_FLOW, nver = nver1;
1382 /* calculate number of blocks S[h] must be split into when
1383 l_shift_num first items and l_shift_bytes of the right most
1384 liquid item to be shifted are shifted to the left neighbor,
1385 as well as number of items in each part of the splitted node (s012 numbers),
1386 and number of bytes (s1bytes) of the shared drop which flow to S1 if any
1388 lset = LEFT_SHIFT_NO_FLOW;
1389 lnver = get_num_ver (vn->vn_mode, tb, h,
1390 lpar - (( h || tb->lbytes == -1 ) ? 0 : 1), -1, h ? vn->vn_nr_item:0, -1,
1391 snum012 + LEFT_SHIFT_NO_FLOW, NO_FLOW);
1396 lnver1 = get_num_ver (vn->vn_mode, tb, h,
1397 lpar - ((tb->lbytes != -1) ? 1 : 0), tb->lbytes, 0, -1,
1398 snum012 + LEFT_SHIFT_FLOW, FLOW);
1400 lset = LEFT_SHIFT_FLOW, lnver = lnver1;
1404 /* calculate number of blocks S[h] must be split into when
1405 r_shift_num first items and r_shift_bytes of the left most
1406 liquid item to be shifted are shifted to the right neighbor,
1407 as well as number of items in each part of the splitted node (s012 numbers),
1408 and number of bytes (s1bytes) of the shared drop which flow to S1 if any
1410 rset = RIGHT_SHIFT_NO_FLOW;
1411 rnver = get_num_ver (vn->vn_mode, tb, h,
1412 0, -1, h ? (vn->vn_nr_item-rpar) : (rpar - (( tb->rbytes != -1 ) ? 1 : 0)), -1,
1413 snum012 + RIGHT_SHIFT_NO_FLOW, NO_FLOW);
1418 rnver1 = get_num_ver (vn->vn_mode, tb, h,
1419 0, -1, (rpar - ((tb->rbytes != -1) ? 1 : 0)), tb->rbytes,
1420 snum012 + RIGHT_SHIFT_FLOW, FLOW);
1423 rset = RIGHT_SHIFT_FLOW, rnver = rnver1;
1427 /* calculate number of blocks S[h] must be split into when
1428 items are shifted in both directions,
1429 as well as number of items in each part of the splitted node (s012 numbers),
1430 and number of bytes (s1bytes) of the shared drop which flow to S1 if any
1432 lrset = LR_SHIFT_NO_FLOW;
1433 lrnver = get_num_ver (vn->vn_mode, tb, h,
1434 lpar - ((h || tb->lbytes == -1) ? 0 : 1), -1, h ? (vn->vn_nr_item-rpar):(rpar - ((tb->rbytes != -1) ? 1 : 0)), -1,
1435 snum012 + LR_SHIFT_NO_FLOW, NO_FLOW);
1440 lrnver1 = get_num_ver (vn->vn_mode, tb, h,
1441 lpar - ((tb->lbytes != -1) ? 1 : 0), tb->lbytes, (rpar - ((tb->rbytes != -1) ? 1 : 0)), tb->rbytes,
1442 snum012 + LR_SHIFT_FLOW, FLOW);
1443 if (lrnver > lrnver1)
1444 lrset = LR_SHIFT_FLOW, lrnver = lrnver1;
1449 /* Our general shifting strategy is:
1450 1) to minimize the number of new nodes;
1451 2) to minimize the number of neighbors involved in shifting;
1452 3) to minimize the number of disk reads; */
1454 /* we can win TWO or ONE nodes by shifting in both directions */
1455 if (lrnver < lnver && lrnver < rnver)
1458 (tb->lnum[h] != 1 ||
1460 lrnver != 1 || rnver != 2 || lnver != 2 || h != 1),
1462 if (lrset == LR_SHIFT_FLOW)
1463 set_parameters (tb, h, tb->lnum[h], tb->rnum[h], lrnver, snum012 + lrset,
1464 tb->lbytes, tb->rbytes);
1466 set_parameters (tb, h, tb->lnum[h] - ((tb->lbytes == -1) ? 0 : 1),
1467 tb->rnum[h] - ((tb->rbytes == -1) ? 0 : 1), lrnver, snum012 + lrset, -1, -1);
1472 /* if shifting doesn't lead to better packing then don't shift */
1475 set_parameters (tb, h, 0, 0, nver, snum012 + nset, -1, -1);
1480 /* now we know that for better packing shifting in only one
1481 direction either to the left or to the right is required */
1483 /* if shifting to the left is better than shifting to the right */
1490 /* if shifting to the right is better than shifting to the left */
1493 SET_PAR_SHIFT_RIGHT;
1498 /* now shifting in either direction gives the same number
1499 of nodes and we can make use of the cached neighbors */
1500 if (is_left_neighbor_in_cache (tb,h))
1506 /* shift to the right independently on whether the right neighbor in cache or not */
1507 SET_PAR_SHIFT_RIGHT;
1513 /* Check whether current node S[h] is balanced when Decreasing its size by
1514 * Deleting or Cutting for INTERNAL node of S+tree.
1515 * Calculate parameters for balancing for current level h.
1517 * tb tree_balance structure;
1518 * h current level of the node;
1519 * inum item number in S[h];
1520 * mode i - insert, p - paste;
1521 * Returns: 1 - schedule occurred;
1522 * 0 - balancing for higher levels needed;
1523 * -1 - no balancing for higher levels needed;
1524 * -2 - no disk space.
1526 * Note: Items of internal nodes have fixed size, so the balance condition for
1527 * the internal part of S+tree is as for the B-trees.
/*
 * Compute balancing parameters for level h when an INTERNAL node S[h]
 * is SHRINKING (delete/cut).  Internal items have fixed size, so the
 * analysis works purely in item counts: keep S[h] if it still meets
 * MIN_NR_KEY, otherwise merge it into L[h], R[h], or both, or borrow
 * items from a neighbor to restore the B-tree invariant.
 * NOTE(review): elided listing — several closing braces/returns and the
 * local declarations (order_L/order_R/n/to_r/from_l) are not shown.
 */
1529 static int dc_check_balance_internal (struct tree_balance * tb, int h)
1531 struct virtual_node * vn = tb->tb_vn;
1533 /* Sh is the node whose balance is currently being checked,
1534 and Fh is its father. */
1535 struct buffer_head * Sh, * Fh;
1538 int lfree, rfree /* free space in L and R */;
1540 Sh = PATH_H_PBUFFER (tb->tb_path, h);
1541 Fh = PATH_H_PPARENT (tb->tb_path, h);
1543 maxsize = MAX_CHILD_SIZE(Sh);
1545 /* using tb->insert_size[h], which is negative in this case, create_virtual_node calculates: */
1546 /* new_nr_item = number of items node would have if operation is */
1547 /* performed without balancing (new_nr_item); */
1548 create_virtual_node (tb, h);
1551 { /* S[h] is the root. */
1552 if ( vn->vn_nr_item > 0 )
1554 set_parameters (tb, h, 0, 0, 1, NULL, -1, -1);
1555 return NO_BALANCING_NEEDED; /* no balancing for higher levels needed */
1557 /* new_nr_item == 0.
1558 * Current root will be deleted resulting in
1559 * decrementing the tree height. */
1560 set_parameters (tb, h, 0, 0, 0, NULL, -1, -1);
1564 if ( (n_ret_value = get_parents(tb,h)) != CARRY_ON )
1568 /* get free space of neighbors */
1569 rfree = get_rfree (tb, h);
1570 lfree = get_lfree (tb, h);
1572 /* determine maximal number of items we can fit into neighbors */
1573 check_left (tb, h, lfree);
1574 check_right (tb, h, rfree);
1577 if ( vn->vn_nr_item >= MIN_NR_KEY(Sh) )
1578 { /* Balance condition for the internal node is valid.
1579 * In this case we balance only if it leads to better packing. */
1580 if ( vn->vn_nr_item == MIN_NR_KEY(Sh) )
1581 { /* Here we join S[h] with one of its neighbors,
1582 * which is impossible with greater values of new_nr_item. */
1583 if ( tb->lnum[h] >= vn->vn_nr_item + 1 )
1585 /* All contents of S[h] can be moved to L[h]. */
1589 order_L = ((n=PATH_H_B_ITEM_ORDER(tb->tb_path, h))==0) ? B_NR_ITEMS(tb->FL[h]) : n - 1;
1590 n = dc_size(B_N_CHILD(tb->FL[h],order_L)) / (DC_SIZE + KEY_SIZE);
1591 set_parameters (tb, h, -n-1, 0, 0, NULL, -1, -1);
1595 if ( tb->rnum[h] >= vn->vn_nr_item + 1 )
1597 /* All contents of S[h] can be moved to R[h]. */
1601 order_R = ((n=PATH_H_B_ITEM_ORDER(tb->tb_path, h))==B_NR_ITEMS(Fh)) ? 0 : n + 1;
1602 n = dc_size(B_N_CHILD(tb->FR[h],order_R)) / (DC_SIZE + KEY_SIZE);
1603 set_parameters (tb, h, 0, -n-1, 0, NULL, -1, -1);
1608 if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1)
1610 /* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */
1613 to_r = ((MAX_NR_KEY(Sh)<<1)+2-tb->lnum[h]-tb->rnum[h]+vn->vn_nr_item+1)/2 -
1614 (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]);
1615 set_parameters (tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL, -1, -1);
1619 /* Balancing does not lead to better packing. */
1620 set_parameters (tb, h, 0, 0, 1, NULL, -1, -1);
1621 return NO_BALANCING_NEEDED;
1624 /* Current node contains insufficient number of items. Balancing is required. */
1625 /* Check whether we can merge S[h] with left neighbor. */
1626 if (tb->lnum[h] >= vn->vn_nr_item + 1)
1627 if (is_left_neighbor_in_cache (tb,h) || tb->rnum[h] < vn->vn_nr_item + 1 || !tb->FR[h])
1632 order_L = ((n=PATH_H_B_ITEM_ORDER(tb->tb_path, h))==0) ? B_NR_ITEMS(tb->FL[h]) : n - 1;
1633 n = dc_size(B_N_CHILD(tb->FL[h],order_L)) / (DC_SIZE + KEY_SIZE);
1634 set_parameters (tb, h, -n-1, 0, 0, NULL, -1, -1);
1638 /* Check whether we can merge S[h] with right neighbor. */
1639 if (tb->rnum[h] >= vn->vn_nr_item + 1)
1644 order_R = ((n=PATH_H_B_ITEM_ORDER(tb->tb_path, h))==B_NR_ITEMS(Fh)) ? 0 : (n + 1);
1645 n = dc_size(B_N_CHILD(tb->FR[h],order_R)) / (DC_SIZE + KEY_SIZE);
1646 set_parameters (tb, h, 0, -n-1, 0, NULL, -1, -1);
1650 /* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */
1651 if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1)
1655 to_r = ((MAX_NR_KEY(Sh)<<1)+2-tb->lnum[h]-tb->rnum[h]+vn->vn_nr_item+1)/2 -
1656 (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]);
1657 set_parameters (tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL, -1, -1);
1661 /* For internal nodes try to borrow item from a neighbor */
1662 RFALSE( !tb->FL[h] && !tb->FR[h], "vs-8235: trying to borrow for root");
1664 /* Borrow one or two items from caching neighbor */
1665 if (is_left_neighbor_in_cache (tb,h) || !tb->FR[h])
1669 from_l = (MAX_NR_KEY(Sh) + 1 - tb->lnum[h] + vn->vn_nr_item + 1) / 2 - (vn->vn_nr_item + 1);
1670 set_parameters (tb, h, -from_l, 0, 1, NULL, -1, -1);
1674 set_parameters (tb, h, 0, -((MAX_NR_KEY(Sh)+1-tb->rnum[h]+vn->vn_nr_item+1)/2-(vn->vn_nr_item+1)), 1,
1680 /* Check whether current node S[h] is balanced when Decreasing its size by
1681 * Deleting or Truncating for LEAF node of S+tree.
1682 * Calculate parameters for balancing for current level h.
1684 * tb tree_balance structure;
1685 * h current level of the node;
1686 * inum item number in S[h];
1687 * mode i - insert, p - paste;
1688 * Returns: 1 - schedule occurred;
1689 * 0 - balancing for higher levels needed;
1690 * -1 - no balancing for higher levels needed;
1691 * -2 - no disk space.
/*
 * Compute balancing parameters for the LEAF level when S[0] is
 * SHRINKING (delete/truncate).  Order of preference: merge three
 * leaves into one, merge S[0] with the left neighbor, with the right
 * neighbor, spread S[0] over both neighbors, or leave it alone.
 * NOTE(review): elided listing — h is expected to be 0 here
 * (S0/F0 are taken at level 0); confirm against the full source.
 */
1693 static int dc_check_balance_leaf (struct tree_balance * tb, int h)
1695 struct virtual_node * vn = tb->tb_vn;
1697 /* Number of bytes that must be deleted from
1698 (value is negative if bytes are deleted) buffer which
1699 contains node being balanced. The mnemonic is that the
1700 attempted change in node space used level is levbytes bytes. */
1702 /* the maximal item size */
1705 /* S0 is the node whose balance is currently being checked,
1706 and F0 is its father. */
1707 struct buffer_head * S0, * F0;
1708 int lfree, rfree /* free space in L and R */;
1710 S0 = PATH_H_PBUFFER (tb->tb_path, 0);
1711 F0 = PATH_H_PPARENT (tb->tb_path, 0);
1713 levbytes = tb->insert_size[h];
1715 maxsize = MAX_CHILD_SIZE(S0); /* maximal possible size of an item */
1718 { /* S[0] is the root now. */
1720 RFALSE( -levbytes >= maxsize - B_FREE_SPACE (S0),
1721 "vs-8240: attempt to create empty buffer tree");
1723 set_parameters (tb, h, 0, 0, 1, NULL, -1, -1);
1724 return NO_BALANCING_NEEDED;
1727 if ( (n_ret_value = get_parents(tb,h)) != CARRY_ON )
1730 /* get free space of neighbors */
1731 rfree = get_rfree (tb, h);
1732 lfree = get_lfree (tb, h);
1734 create_virtual_node (tb, h);
1736 /* if 3 leaves can be merged into one, set parameters and return */
1737 if (are_leaves_removable (tb, lfree, rfree))
1740 /* determine maximal number of items we can shift to the left/right neighbor
1741 and the maximal number of bytes that can flow to the left/right neighbor
1742 from the left/right most liquid item that cannot be shifted from S[0] entirely
1744 check_left (tb, h, lfree);
1745 check_right (tb, h, rfree);
1747 /* check whether we can merge S with left neighbor. */
1748 if (tb->lnum[0] >= vn->vn_nr_item && tb->lbytes == -1)
1749 if (is_left_neighbor_in_cache (tb,h) ||
1750 ((tb->rnum[0] - ((tb->rbytes == -1) ? 0 : 1)) < vn->vn_nr_item) || /* S can not be merged with R */
1753 RFALSE( !tb->FL[h], "vs-8245: dc_check_balance_leaf: FL[h] must exist");
1755 /* set parameter to merge S[0] with its left neighbor */
1756 set_parameters (tb, h, -1, 0, 0, NULL, -1, -1);
1760 /* check whether we can merge S[0] with right neighbor. */
1761 if (tb->rnum[0] >= vn->vn_nr_item && tb->rbytes == -1) {
1762 set_parameters (tb, h, 0, -1, 0, NULL, -1, -1);
1766 /* All contents of S[0] can be moved to the neighbors (L[0] & R[0]). Set parameters and return */
1767 if (is_leaf_removable (tb))
1770 /* Balancing is not required. */
1771 tb->s0num = vn->vn_nr_item;
1772 set_parameters (tb, h, 0, 0, 1, NULL, -1, -1);
1773 return NO_BALANCING_NEEDED;
1778 /* Check whether current node S[h] is balanced when Decreasing its size by
1779 * Deleting or Cutting.
1780 * Calculate parameters for balancing for current level h.
1782 * tb tree_balance structure;
1783 * h current level of the node;
1784 * inum item number in S[h];
1785 * mode d - delete, c - cut.
1786 * Returns: 1 - schedule occurred;
1787 * 0 - balancing for higher levels needed;
1788 * -1 - no balancing for higher levels needed;
1789 * -2 - no disk space.
/*
 * Dispatch the "node is shrinking" (delete/cut) balance check:
 * internal levels go to dc_check_balance_internal(), the leaf level
 * to dc_check_balance_leaf().  The h-based condition between the two
 * returns is elided from this listing.
 */
1791 static int dc_check_balance (struct tree_balance * tb, int h)
1793 RFALSE( ! (PATH_H_PBUFFER (tb->tb_path, h)), "vs-8250: S is not initialized");
1796 return dc_check_balance_internal (tb, h);
1798 return dc_check_balance_leaf (tb, h);
1803 /* Check whether current node S[h] is balanced.
1804 * Calculate parameters for balancing for current level h.
1807 * tb tree_balance structure:
1809 * tb is a large structure that must be read about in the header file
1810 * at the same time as this procedure if the reader is to successfully
1811 * understand this procedure
1813 * h current level of the node;
1814 * inum item number in S[h];
1815 * mode i - insert, p - paste, d - delete, c - cut.
1816 * Returns: 1 - schedule occurred;
1817 * 0 - balancing for higher levels needed;
1818 * -1 - no balancing for higher levels needed;
1819 * -2 - no disk space.
/*
 * Top-level balance-parameter dispatcher.  Lays the virtual node out
 * in tb->vn_buf, records the affected item/position/insert header,
 * then routes to ip_check_balance() (growing: insert/paste) or
 * dc_check_balance() (shrinking: delete/cut) based on the sign of
 * insert_size[h].  Several parameters (h, inum, pos_in_item) are
 * elided from this listing of the signature.
 */
1821 static int check_balance (int mode,
1822 struct tree_balance * tb,
1826 struct item_head * ins_ih,
1830 struct virtual_node * vn;
1832 vn = tb->tb_vn = (struct virtual_node *)(tb->vn_buf);
1833 vn->vn_free_ptr = (char *)(tb->tb_vn + 1);
1835 vn->vn_affected_item_num = inum;
1836 vn->vn_pos_in_item = pos_in_item;
1837 vn->vn_ins_ih = ins_ih;
1840 RFALSE( mode == M_INSERT && !vn->vn_ins_ih,
1841 "vs-8255: ins_ih can not be 0 in insert mode");
1843 if ( tb->insert_size[h] > 0 )
1844 /* Calculate balance parameters when size of node is increasing. */
1845 return ip_check_balance (tb, h);
1847 /* Calculate balance parameters when size of node is decreasing. */
1848 return dc_check_balance (tb, h);
1853 /* Check whether parent at the path is the really parent of the current node.*/
/*
 * Verify that the buffer recorded in the path at level n_h+1 is still
 * the real parent of the level-n_h node.  Returns CARRY_ON when the
 * parent is valid and unlocked, REPEAT_SEARCH when the tree changed
 * underneath us and the path must be rebuilt.
 */
1854 static int get_direct_parent(
1855 struct tree_balance * p_s_tb,
1858 struct buffer_head * p_s_bh;
1859 struct path * p_s_path = p_s_tb->tb_path;
1861 n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h);
1863 /* We are in the root or in the new root. */
1864 if ( n_path_offset <= FIRST_PATH_ELEMENT_OFFSET ) {
1866 RFALSE( n_path_offset < FIRST_PATH_ELEMENT_OFFSET - 1,
1867 "PAP-8260: illegal offset in the path");
1869 if ( PATH_OFFSET_PBUFFER(p_s_path, FIRST_PATH_ELEMENT_OFFSET)->b_blocknr ==
1870 SB_ROOT_BLOCK (p_s_tb->tb_sb) ) {
1871 /* Root is not changed. */
1872 PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1) = NULL;
1873 PATH_OFFSET_POSITION(p_s_path, n_path_offset - 1) = 0;
1876 return REPEAT_SEARCH; /* Root is changed and we must recalculate the path. */
1879 if ( ! B_IS_IN_TREE(p_s_bh = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1)) )
1880 return REPEAT_SEARCH; /* Parent in the path is not in the tree. */
1882 if ( (n_position = PATH_OFFSET_POSITION(p_s_path, n_path_offset - 1)) > B_NR_ITEMS(p_s_bh) )
1883 return REPEAT_SEARCH;
1885 if ( B_N_CHILD_NUM(p_s_bh, n_position) != PATH_OFFSET_PBUFFER(p_s_path, n_path_offset)->b_blocknr )
1886 /* Parent in the path is not parent of the current node in the tree. */
1887 return REPEAT_SEARCH;
1889 if ( buffer_locked(p_s_bh) ) {
1890 __wait_on_buffer(p_s_bh);
1891 if ( FILESYSTEM_CHANGED_TB (p_s_tb) )
1892 return REPEAT_SEARCH;
1895 return CARRY_ON; /* Parent in the path is unlocked and really parent of the current node. */
1899 /* Using lnum[n_h] and rnum[n_h] we should determine what neighbors
1901 * need in order to balance S[n_h], and get them if necessary.
1902 * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked;
1903 * CARRY_ON - schedule didn't occur while the function worked;
/*
 * Read in L[n_h] and/or R[n_h] (as demanded by lnum[n_h]/rnum[n_h])
 * via their parents FL[n_h]/FR[n_h], caching the buffers in
 * p_s_tb->L[]/R[].  Returns REPEAT_SEARCH if the filesystem changed
 * while a child was being read, otherwise carries on.
 * NOTE(review): elided listing — the read-error checks after each
 * reiserfs_bread() and the final CARRY_ON are not shown here.
 */
1905 static int get_neighbors(
1906 struct tree_balance * p_s_tb,
1909 int n_child_position,
1910 n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h + 1);
1911 unsigned long n_son_number;
1912 struct super_block * p_s_sb = p_s_tb->tb_sb;
1913 struct buffer_head * p_s_bh;
1916 PROC_INFO_INC( p_s_sb, get_neighbors[ n_h ] );
1918 if ( p_s_tb->lnum[n_h] ) {
1919 /* We need left neighbor to balance S[n_h]. */
1920 PROC_INFO_INC( p_s_sb, need_l_neighbor[ n_h ] );
1921 p_s_bh = PATH_OFFSET_PBUFFER(p_s_tb->tb_path, n_path_offset);
1923 RFALSE( p_s_bh == p_s_tb->FL[n_h] &&
1924 ! PATH_OFFSET_POSITION(p_s_tb->tb_path, n_path_offset),
1925 "PAP-8270: invalid position in the parent");
1927 n_child_position = ( p_s_bh == p_s_tb->FL[n_h] ) ? p_s_tb->lkey[n_h] : B_NR_ITEMS (p_s_tb->FL[n_h]);
1928 n_son_number = B_N_CHILD_NUM(p_s_tb->FL[n_h], n_child_position);
1929 p_s_bh = reiserfs_bread(p_s_sb, n_son_number, p_s_sb->s_blocksize);
1932 if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) {
1933 decrement_bcount(p_s_bh);
1934 PROC_INFO_INC( p_s_sb, get_neighbors_restart[ n_h ] );
1935 return REPEAT_SEARCH;
1938 RFALSE( ! B_IS_IN_TREE(p_s_tb->FL[n_h]) ||
1939 n_child_position > B_NR_ITEMS(p_s_tb->FL[n_h]) ||
1940 B_N_CHILD_NUM(p_s_tb->FL[n_h], n_child_position) !=
1941 p_s_bh->b_blocknr, "PAP-8275: invalid parent");
1942 RFALSE( ! B_IS_IN_TREE(p_s_bh), "PAP-8280: invalid child");
1944 B_FREE_SPACE (p_s_bh) != MAX_CHILD_SIZE (p_s_bh) - dc_size(B_N_CHILD (p_s_tb->FL[0],n_child_position)),
1945 "PAP-8290: invalid child size of left neighbor");
1947 decrement_bcount(p_s_tb->L[n_h]);
1948 p_s_tb->L[n_h] = p_s_bh;
1952 if ( p_s_tb->rnum[n_h] ) { /* We need right neighbor to balance S[n_path_offset]. */
1953 PROC_INFO_INC( p_s_sb, need_r_neighbor[ n_h ] );
1954 p_s_bh = PATH_OFFSET_PBUFFER(p_s_tb->tb_path, n_path_offset);
1956 RFALSE( p_s_bh == p_s_tb->FR[n_h] &&
1957 PATH_OFFSET_POSITION(p_s_tb->tb_path, n_path_offset) >= B_NR_ITEMS(p_s_bh),
1958 "PAP-8295: invalid position in the parent");
1960 n_child_position = ( p_s_bh == p_s_tb->FR[n_h] ) ? p_s_tb->rkey[n_h] + 1 : 0;
1961 n_son_number = B_N_CHILD_NUM(p_s_tb->FR[n_h], n_child_position);
1962 p_s_bh = reiserfs_bread(p_s_sb, n_son_number, p_s_sb->s_blocksize);
1965 if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) {
1966 decrement_bcount(p_s_bh);
1967 PROC_INFO_INC( p_s_sb, get_neighbors_restart[ n_h ] );
1968 return REPEAT_SEARCH;
1970 decrement_bcount(p_s_tb->R[n_h]);
1971 p_s_tb->R[n_h] = p_s_bh;
1973 RFALSE( ! n_h && B_FREE_SPACE (p_s_bh) != MAX_CHILD_SIZE (p_s_bh) - dc_size(B_N_CHILD (p_s_tb->FR[0],n_child_position)),
1974 "PAP-8300: invalid child size of right neighbor (%d != %d - %d)",
1975 B_FREE_SPACE (p_s_bh), MAX_CHILD_SIZE (p_s_bh),
1976 dc_size(B_N_CHILD (p_s_tb->FR[0],n_child_position)));
1982 #ifdef CONFIG_REISERFS_CHECK
/*
 * Debug (CONFIG_REISERFS_CHECK) wrapper around kmalloc() that tracks
 * per-superblock allocation totals in s_kmallocs and warns every time
 * the total grows by more than ~200KB since the last warning.
 * NOTE(review): elided listing — vp's declaration, the NULL check on
 * the kmalloc result, and the return are not shown here.
 */
1983 void * reiserfs_kmalloc (size_t size, int flags, struct super_block * s)
1986 static size_t malloced;
1989 vp = kmalloc (size, flags);
1991 s->u.reiserfs_sb.s_kmallocs += size;
1992 if (s->u.reiserfs_sb.s_kmallocs > malloced + 200000) {
1993 reiserfs_warning (s, "vs-8301: reiserfs_kmalloc: allocated memory %d\n", s->u.reiserfs_sb.s_kmallocs);
1994 malloced = s->u.reiserfs_sb.s_kmallocs;
1997 /*printk ("malloc : size %d, allocated %d\n", size, s->u.reiserfs_sb.s_kmallocs);*/
/*
 * Debug counterpart of reiserfs_kmalloc(): frees the allocation and
 * subtracts its size from the per-superblock s_kmallocs counter,
 * warning if the counter goes negative (i.e. a size/accounting bug).
 * The kfree() call itself is elided from this listing.
 */
2001 void reiserfs_kfree (const void * vp, size_t size, struct super_block * s)
2007 s->u.reiserfs_sb.s_kmallocs -= size;
2008 if (s->u.reiserfs_sb.s_kmallocs < 0)
2009 reiserfs_warning (s, "vs-8302: reiserfs_kfree: allocated memory %d\n", s->u.reiserfs_sb.s_kmallocs);
/*
 * Return the worst-case number of bytes needed to hold a virtual node
 * for a node of this filesystem's block size: the virtual_node header
 * plus whichever is larger — the maximum possible item count, or a
 * single directory item packed with minimum-length (1-char) entries.
 */
2015 static int get_virtual_node_size (struct super_block * sb, struct buffer_head * bh)
2017 int max_num_of_items;
2018 int max_num_of_entries;
2019 unsigned long blocksize = sb->s_blocksize;
2021 #define MIN_NAME_LEN 1
2023 max_num_of_items = (blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN);
2024 max_num_of_entries = (blocksize - BLKH_SIZE - IH_SIZE) /
2025 (DEH_SIZE + MIN_NAME_LEN);
2027 return sizeof(struct virtual_node) +
2028 max(max_num_of_items * sizeof (struct virtual_item),
2029 sizeof (struct virtual_item) + sizeof(struct direntry_uarea) +
2030 (max_num_of_entries - 1) * sizeof (__u16));
2035 /* maybe we should fail balancing we are going to perform when kmalloc
2036 fails several times. But now it will loop until kmalloc gets
/*
 * Ensure tb->vn_buf is large enough for the virtual node of the leaf
 * at the end of the path, (re)allocating with GFP_ATOMIC first and
 * falling back to GFP_NOFS after releasing held buffers.  Returns
 * REPEAT_SEARCH when the caller must redo the tree search (buffer was
 * reallocated or the fs changed), CARRY_ON otherwise (elided here).
 */
2038 static int get_mem_for_virtual_node (struct tree_balance * tb)
2044 size = get_virtual_node_size (tb->tb_sb, PATH_PLAST_BUFFER (tb->tb_path));
2046 if (size > tb->vn_buf_size) {
2047 /* we have to allocate more memory for virtual node */
2049 /* free memory allocated before */
2050 reiserfs_kfree (tb->vn_buf, tb->vn_buf_size, tb->tb_sb);
2051 /* this is not needed if kfree is atomic */
2055 /* virtual node requires now more memory */
2056 tb->vn_buf_size = size;
2058 /* get memory for virtual item */
2059 buf = reiserfs_kmalloc(size, GFP_ATOMIC, tb->tb_sb);
2061 /* getting memory with GFP_KERNEL priority may involve
2062 balancing now (due to indirect_to_direct conversion on
2063 dcache shrinking). So, release path and collected
2065 free_buffers_in_tb (tb);
2066 buf = reiserfs_kmalloc(size, GFP_NOFS, tb->tb_sb);
2068 #ifdef CONFIG_REISERFS_CHECK
2069 reiserfs_warning (tb->tb_sb, "vs-8345: get_mem_for_virtual_node: "
2070 "kmalloc failed. reiserfs kmalloced %d bytes\n",
2071 tb->tb_sb->u.reiserfs_sb.s_kmallocs);
2073 tb->vn_buf_size = 0;
2077 return REPEAT_SEARCH;
2083 if ( check_fs && FILESYSTEM_CHANGED_TB (tb) )
2084 return REPEAT_SEARCH;
2090 #ifdef CONFIG_REISERFS_CHECK
/*
 * Debug-only (CONFIG_REISERFS_CHECK) sanity checks on a tree-balance
 * buffer: panics if the refcount is non-positive, the buffer is not
 * up to date, not in the tree, or its device/size/block number is
 * inconsistent with the superblock.  descr/level identify the buffer
 * role (e.g. "S", "L") for the panic message.
 */
2091 static void tb_buffer_sanity_check (struct super_block * p_s_sb,
2092 struct buffer_head * p_s_bh,
2093 const char *descr, int level) {
2095 if (atomic_read (&(p_s_bh->b_count)) <= 0) {
2097 reiserfs_panic (p_s_sb, "tb_buffer_sanity_check(): negative or zero reference counter for buffer %s[%d] (%b)\n", descr, level, p_s_bh);
2100 if ( ! buffer_uptodate (p_s_bh) ) {
2101 reiserfs_panic (p_s_sb, "tb_buffer_sanity_check(): buffer is not up to date %s[%d] (%b)\n", descr, level, p_s_bh);
2104 if ( ! B_IS_IN_TREE (p_s_bh) ) {
2105 reiserfs_panic (p_s_sb, "tb_buffer_sanity_check(): buffer is not in tree %s[%d] (%b)\n", descr, level, p_s_bh);
2108 if (p_s_bh->b_dev != p_s_sb->s_dev ||
2109 p_s_bh->b_size != p_s_sb->s_blocksize ||
2110 p_s_bh->b_blocknr > SB_BLOCK_COUNT(p_s_sb)) {
2111 reiserfs_panic (p_s_sb, "tb_buffer_sanity_check(): check failed for buffer %s[%d] (%b)\n", descr, level, p_s_bh);
/* Non-debug build: sanity checking compiles away to a no-op stub
** (empty body elided in this view). */
2116 static void tb_buffer_sanity_check (struct super_block * p_s_sb,
2117 struct buffer_head * p_s_bh,
2118 const char *descr, int level)
/* Clear the buffer's dirty state by preparing it for the journal; the
** third argument is 0 — presumably the non-waiting variant of
** reiserfs_prepare_for_journal (NOTE(review): confirm against journal.c).
*/
2122 static void clear_all_dirty_bits(struct super_block *s,
2123 struct buffer_head *bh) {
2124 reiserfs_prepare_for_journal(s, bh, 0) ;
/* Before balancing starts, walk every buffer the tree_balance has
** collected — the search path, left/right neighbors and their parents
** (L/FL/CFL and R/FR/CFR per level), and the free-empty-block list
** (FEB) — clearing dirty state on each and remembering the first one
** found locked.  If a locked buffer is found, wait for it to unlock and
** retry; if the filesystem changed while waiting, return REPEAT_SEARCH
** so the caller redoes the search.  NOTE(review): interior lines
** (loop wrapper around the scan, some braces) are elided in this view.
*/
2127 static int wait_tb_buffers_until_unlocked (struct tree_balance * p_s_tb)
2129 struct buffer_head * locked;
2130 #ifdef CONFIG_REISERFS_CHECK
2131 int repeat_counter = 0;
/* scan the path from the leaf back toward the root */
2139 for ( i = p_s_tb->tb_path->path_length; !locked && i > ILLEGAL_PATH_ELEMENT_OFFSET; i-- ) {
2140 if ( PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i) ) {
2141 /* if I understand correctly, we can only be sure the last buffer
2142 ** in the path is in the tree --clm
2144 #ifdef CONFIG_REISERFS_CHECK
2145 if (PATH_PLAST_BUFFER(p_s_tb->tb_path) ==
2146 PATH_OFFSET_PBUFFER(p_s_tb->tb_path, i)) {
2147 tb_buffer_sanity_check (p_s_tb->tb_sb,
2148 PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i),
2150 p_s_tb->tb_path->path_length - i);
2153 clear_all_dirty_bits(p_s_tb->tb_sb,
2154 PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i)) ;
2156 if ( buffer_locked (PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i)) )
2157 locked = PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i);
/* per-level neighbors: only levels with a pending insert_size matter */
2161 for ( i = 0; !locked && i < MAX_HEIGHT && p_s_tb->insert_size[i]; i++ ) {
2163 if (p_s_tb->lnum[i] ) {
2165 if ( p_s_tb->L[i] ) {
2166 tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->L[i], "L", i);
2167 clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->L[i]) ;
2168 if ( buffer_locked (p_s_tb->L[i]) )
2169 locked = p_s_tb->L[i];
2172 if ( !locked && p_s_tb->FL[i] ) {
2173 tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->FL[i], "FL", i);
2174 clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FL[i]) ;
2175 if ( buffer_locked (p_s_tb->FL[i]) )
2176 locked = p_s_tb->FL[i];
2179 if ( !locked && p_s_tb->CFL[i] ) {
2180 tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->CFL[i], "CFL", i);
2181 clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->CFL[i]) ;
2182 if ( buffer_locked (p_s_tb->CFL[i]) )
2183 locked = p_s_tb->CFL[i];
/* same treatment for the right neighbor and its parents */
2188 if ( !locked && (p_s_tb->rnum[i]) ) {
2190 if ( p_s_tb->R[i] ) {
2191 tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->R[i], "R", i);
2192 clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->R[i]) ;
2193 if ( buffer_locked (p_s_tb->R[i]) )
2194 locked = p_s_tb->R[i];
2198 if ( !locked && p_s_tb->FR[i] ) {
2199 tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->FR[i], "FR", i);
2200 clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FR[i]) ;
2201 if ( buffer_locked (p_s_tb->FR[i]) )
2202 locked = p_s_tb->FR[i];
2205 if ( !locked && p_s_tb->CFR[i] ) {
2206 tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->CFR[i], "CFR", i);
2207 clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->CFR[i]) ;
2208 if ( buffer_locked (p_s_tb->CFR[i]) )
2209 locked = p_s_tb->CFR[i];
2213 /* as far as I can tell, this is not required. The FEB list seems
2214 ** to be full of newly allocated nodes, which will never be locked,
2215 ** dirty, or anything else.
2216 ** To be safe, I'm putting in the checks and waits in. For the moment,
2217 ** they are needed to keep the code in journal.c from complaining
2218 ** about the buffer. That code is inside CONFIG_REISERFS_CHECK as well.
2221 for ( i = 0; !locked && i < MAX_FEB_SIZE; i++ ) {
2222 if ( p_s_tb->FEB[i] ) {
2223 clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FEB[i]) ;
2224 if (buffer_locked(p_s_tb->FEB[i])) {
2225 locked = p_s_tb->FEB[i] ;
2231 #ifdef CONFIG_REISERFS_CHECK
/* debug builds warn every 10000 retries and eventually bail out
   instead of spinning forever on a buffer that never unlocks */
2233 if ( (repeat_counter % 10000) == 0) {
2234 reiserfs_warning (p_s_tb->tb_sb, "wait_tb_buffers_until_released(): too many iterations waiting for buffer to unlock (%b)\n", locked);
2236 /* Don't loop forever. Try to recover from possible error. */
2238 return ( FILESYSTEM_CHANGED_TB (p_s_tb) ) ? REPEAT_SEARCH : CARRY_ON;
/* block until the offending buffer is released; waiting may schedule,
   so the whole analysis is invalid if the fs changed meanwhile */
2241 __wait_on_buffer (locked);
2242 if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) {
2243 return REPEAT_SEARCH;
2253 /* Prepare for balancing, that is
2254 * get all necessary parents, and neighbors;
2255 * analyze what and where should be moved;
2256 * get sufficient number of new nodes;
2257 * Balancing will start only after all resources will be collected at a time.
2259 * When ported to SMP kernels, only at the last moment after all needed nodes
2260 * are collected in cache, will the resources be locked using the usual
2261 * textbook ordered lock acquisition algorithms. Note that ensuring that
2262 * this code neither write locks what it does not need to write lock nor locks out of order
2263 * will be a pain in the butt that could have been avoided. Grumble grumble. -Hans
2265 * "fix" is meant in the sense of render unchanging (pin the needed nodes down), not in the sense of repair
2267 * Latency might be improved by first gathering a list of what buffers are needed
2268 * and then getting as many of them in parallel as possible? -Hans
2271 * op_mode i - insert, d - delete, c - cut (truncate), p - paste (append)
2272 * tb tree_balance structure;
2273 * inum item number in S[h];
2274 * pos_in_item - comment this if you can
2275 * ins_ih & ins_sd are used when inserting
2276 * Returns: 1 - schedule occurred while the function worked;
2277 * 0 - schedule didn't occur while the function worked;
2278 * -1 - if no_disk_space
/* Gather and analyze everything a subsequent do_balance needs: direct
** parents, neighbors, and fresh empty nodes for every affected level,
** plus per-level insert_size bookkeeping.  See the block comment above
** for parameter meanings.  Returns CARRY_ON on success, REPEAT_SEARCH
** when the tree changed (or memory/buffers forced a retry), or an error
** from the resource-gathering helpers (e.g. no disk space).
** NOTE(review): many interior lines (declarations, braces, the repeat
** label itself) are elided in this view.
*/
2282 int fix_nodes (int n_op_mode,
2283 struct tree_balance * p_s_tb,
2284 struct item_head * p_s_ins_ih, // item head of item being inserted
2285 const void * data // inserted item or data to be pasted
2289 n_item_num = PATH_LAST_POSITION(p_s_tb->tb_path);
2292 /* we set wait_tb_buffers_run when we have to restore any dirty bits cleared
2293 ** during wait_tb_buffers_run
2295 int wait_tb_buffers_run = 0 ;
2297 struct buffer_head * p_s_tbS0 = PATH_PLAST_BUFFER(p_s_tb->tb_path);
/* statistics counter for fix_nodes invocations */
2299 ++ p_s_tb -> tb_sb -> u.reiserfs_sb.s_fix_nodes;
2301 n_pos_in_item = p_s_tb->tb_path->pos_in_item;
/* remember the fs generation so FILESYSTEM_CHANGED_TB can detect
   concurrent modification later */
2304 p_s_tb->fs_gen = get_generation (p_s_tb->tb_sb);
2306 /* we prepare and log the super here so it will already be in the
2307 ** transaction when do_balance needs to change it.
2308 ** This way do_balance won't have to schedule when trying to prepare
2309 ** the super for logging
2311 reiserfs_prepare_for_journal(p_s_tb->tb_sb,
2312 SB_BUFFER_WITH_SB(p_s_tb->tb_sb), 1) ;
2313 journal_mark_dirty(p_s_tb->transaction_handle, p_s_tb->tb_sb,
2314 SB_BUFFER_WITH_SB(p_s_tb->tb_sb)) ;
2315 if ( FILESYSTEM_CHANGED_TB (p_s_tb) )
2316 return REPEAT_SEARCH;
2318 /* S[0] can be locked during an indirect_to_direct conversion */
2319 if (buffer_locked (p_s_tbS0)) {
2320 __wait_on_buffer (p_s_tbS0);
2321 if ( FILESYSTEM_CHANGED_TB (p_s_tb) )
2322 return REPEAT_SEARCH;
2325 #ifdef CONFIG_REISERFS_CHECK
2327 print_cur_tb ("fix_nodes");
2328 reiserfs_panic(p_s_tb->tb_sb,"PAP-8305: fix_nodes: there is pending do_balance");
2331 if (!buffer_uptodate (p_s_tbS0) || !B_IS_IN_TREE (p_s_tbS0)) {
2332 reiserfs_panic (p_s_tb->tb_sb, "PAP-8320: fix_nodes: S[0] (%b %z) is not uptodate "
2333 "at the beginning of fix_nodes or not in tree (mode %c)", p_s_tbS0, p_s_tbS0, n_op_mode);
2336 /* Check parameters. */
2337 switch (n_op_mode) {
/* insert: position may be one past the last item (append) but not 0-th-or-less */
2339 if ( n_item_num <= 0 || n_item_num > B_NR_ITEMS(p_s_tbS0) )
2340 reiserfs_panic(p_s_tb->tb_sb,"PAP-8330: fix_nodes: Incorrect item number %d (in S0 - %d) in case of insert",
2341 n_item_num, B_NR_ITEMS(p_s_tbS0));
/* paste/cut/delete: position must reference an existing item */
2346 if ( n_item_num < 0 || n_item_num >= B_NR_ITEMS(p_s_tbS0) ) {
2347 print_block (p_s_tbS0, 0, -1, -1);
2348 printk("mode = %c insert_size = %d\n", n_op_mode, p_s_tb->insert_size[0]);
2349 reiserfs_panic(p_s_tb->tb_sb,"PAP-8335: fix_nodes: Incorrect item number(%d)", n_item_num);
2353 reiserfs_panic(p_s_tb->tb_sb,"PAP-8340: fix_nodes: Incorrect mode of operation");
2357 if (get_mem_for_virtual_node (p_s_tb) == REPEAT_SEARCH)
2358 // FIXME: maybe -ENOMEM when tb->vn_buf == 0? Now just repeat
2359 return REPEAT_SEARCH;
2362 /* Starting from the leaf level; for all levels n_h of the tree. */
2363 for ( n_h = 0; n_h < MAX_HEIGHT && p_s_tb->insert_size[n_h]; n_h++ ) {
2364 if ( (n_ret_value = get_direct_parent(p_s_tb, n_h)) != CARRY_ON ) {
2368 if ( (n_ret_value = check_balance (n_op_mode, p_s_tb, n_h, n_item_num,
2369 n_pos_in_item, p_s_ins_ih, data)) != CARRY_ON ) {
2370 if ( n_ret_value == NO_BALANCING_NEEDED ) {
2371 /* No balancing for higher levels needed. */
2372 if ( (n_ret_value = get_neighbors(p_s_tb, n_h)) != CARRY_ON ) {
2375 if ( n_h != MAX_HEIGHT - 1 )
2376 p_s_tb->insert_size[n_h + 1] = 0;
2377 /* ok, analysis and resource gathering are complete */
/* balancing IS needed at this level: collect neighbors and the empty
   nodes the shift plan requires */
2383 if ( (n_ret_value = get_neighbors(p_s_tb, n_h)) != CARRY_ON ) {
2387 if ( (n_ret_value = get_empty_nodes(p_s_tb, n_h)) != CARRY_ON ) {
2388 goto repeat; /* No disk space, or schedule occurred and
2389 analysis may be invalid and needs to be redone. */
2392 if ( ! PATH_H_PBUFFER(p_s_tb->tb_path, n_h) ) {
2393 /* We have a positive insert size but no nodes exist on this
2394 level, this means that we are creating a new root. */
2396 RFALSE( p_s_tb->blknum[n_h] != 1,
2397 "PAP-8350: creating new empty root");
2399 if ( n_h < MAX_HEIGHT - 1 )
2400 p_s_tb->insert_size[n_h + 1] = 0;
2403 if ( ! PATH_H_PBUFFER(p_s_tb->tb_path, n_h + 1) ) {
2404 if ( p_s_tb->blknum[n_h] > 1 ) {
2405 /* The tree needs to be grown, so this node S[n_h]
2406 which is the root node is split into two nodes,
2407 and a new node (S[n_h+1]) will be created to
2408 become the root node. */
2410 RFALSE( n_h == MAX_HEIGHT - 1,
2411 "PAP-8355: attempt to create too high of a tree");
/* new root gets one delimiting key + pointer per extra node, plus the
   pointer to S[n_h] itself */
2413 p_s_tb->insert_size[n_h + 1] = (DC_SIZE + KEY_SIZE) * (p_s_tb->blknum[n_h] - 1) + DC_SIZE;
2416 if ( n_h < MAX_HEIGHT - 1 )
2417 p_s_tb->insert_size[n_h + 1] = 0;
/* parent exists: it only grows by the keys/pointers for the extra nodes */
2420 p_s_tb->insert_size[n_h + 1] = (DC_SIZE + KEY_SIZE) * (p_s_tb->blknum[n_h] - 1);
/* analysis done for all levels: now wait until every involved buffer is
   unlocked; if that succeeds without the fs changing, we are finished */
2424 windex = push_journal_writer("fix_nodes") ;
2425 if ((n_ret_value = wait_tb_buffers_until_unlocked (p_s_tb)) == CARRY_ON) {
2426 pop_journal_writer(windex) ;
2427 if (FILESYSTEM_CHANGED_TB(p_s_tb)) {
2428 wait_tb_buffers_run = 1 ;
2429 n_ret_value = REPEAT_SEARCH ;
2435 wait_tb_buffers_run = 1 ;
2436 pop_journal_writer(windex) ;
2441 // fix_nodes was unable to perform its calculation due to
2442 // filesystem got changed under us, lack of free disk space or i/o
2443 // failure. If the first is the case - the search will be
2444 // repeated. For now - free all resources acquired so far except
2445 // for the new allocated nodes
2449 /* Release path buffers. */
/* if wait_tb_buffers_until_unlocked ran, it cleared dirty bits via the
   journal, so releases must also restore the prepared state */
2450 if (wait_tb_buffers_run) {
2451 pathrelse_and_restore(p_s_tb->tb_sb, p_s_tb->tb_path) ;
2453 pathrelse (p_s_tb->tb_path);
2455 /* brelse all resources collected for balancing */
2456 for ( i = 0; i < MAX_HEIGHT; i++ ) {
2457 if (wait_tb_buffers_run) {
2458 reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->L[i]);
2459 reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->R[i]);
2460 reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->FL[i]);
2461 reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->FR[i]);
2462 reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->CFL[i]);
2463 reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->CFR[i]);
2466 brelse (p_s_tb->L[i]);p_s_tb->L[i] = 0;
2467 brelse (p_s_tb->R[i]);p_s_tb->R[i] = 0;
2468 brelse (p_s_tb->FL[i]);p_s_tb->FL[i] = 0;
2469 brelse (p_s_tb->FR[i]);p_s_tb->FR[i] = 0;
2470 brelse (p_s_tb->CFL[i]);p_s_tb->CFL[i] = 0;
2471 brelse (p_s_tb->CFR[i]);p_s_tb->CFR[i] = 0;
2474 if (wait_tb_buffers_run) {
2475 for ( i = 0; i < MAX_FEB_SIZE; i++ ) {
2476 if ( p_s_tb->FEB[i] ) {
2477 reiserfs_restore_prepared_buffer(p_s_tb->tb_sb,
2488 /* Anatoly will probably forgive me renaming p_s_tb to tb. I just
2489 wanted to make lines shorter */
2490 void unfix_nodes (struct tree_balance * tb)
2494 /* Release path buffers. */
2495 pathrelse_and_restore (tb->tb_sb, tb->tb_path);
2497 /* brelse all resources collected for balancing */
2498 for ( i = 0; i < MAX_HEIGHT; i++ ) {
2499 reiserfs_restore_prepared_buffer (tb->tb_sb, tb->L[i]);
2500 reiserfs_restore_prepared_buffer (tb->tb_sb, tb->R[i]);
2501 reiserfs_restore_prepared_buffer (tb->tb_sb, tb->FL[i]);
2502 reiserfs_restore_prepared_buffer (tb->tb_sb, tb->FR[i]);
2503 reiserfs_restore_prepared_buffer (tb->tb_sb, tb->CFL[i]);
2504 reiserfs_restore_prepared_buffer (tb->tb_sb, tb->CFR[i]);
2510 brelse (tb->CFL[i]);
2511 brelse (tb->CFR[i]);
2514 /* deal with list of allocated (used and unused) nodes */
2515 for ( i = 0; i < MAX_FEB_SIZE; i++ ) {
2517 unsigned long blocknr = tb->FEB[i]->b_blocknr ;
2518 /* de-allocated block which was not used by balancing and
2519 bforget about buffer for it */
2520 brelse (tb->FEB[i]);
2521 reiserfs_free_block (tb->transaction_handle, blocknr);
2524 /* release used as new nodes including a new root */
2525 brelse (tb->used[i]);
2530 reiserfs_kfree (tb->vn_buf, tb->vn_buf_size, tb->tb_sb);