Commit e7c4c676 authored by Zardosht Kasheff's avatar Zardosht Kasheff Committed by Yoni Fogel
Browse files

[t:4028], merge to main

git-svn-id: file:///svn/toku/tokudb@41142 c7de825b-a66e-492c-adef-691d508d4ae1
parent c13b4ad4
Loading
Loading
Loading
Loading
+6 −0
Original line number Diff line number Diff line
@@ -127,6 +127,7 @@ toku_pin_brtnode(
    ANCESTORS ancestors,
    const PIVOT_BOUNDS bounds,
    BRTNODE_FETCH_EXTRA bfe,
    BOOL may_modify_node,
    BOOL apply_ancestor_messages, // this BOOL is probably temporary, for #3972, once we know how range query estimates work, will revisit this
    BRTNODE *node_p,
    BOOL* msgs_applied)
@@ -143,6 +144,7 @@ toku_pin_brtnode(
            toku_brtnode_fetch_callback,
            toku_brtnode_pf_req_callback,
            toku_brtnode_pf_callback,
            may_modify_node,
            bfe, //read_extraargs
            unlockers);
    if (r==0) {
@@ -168,6 +170,7 @@ toku_pin_brtnode_holding_lock(
    const PIVOT_BOUNDS bounds,
    BRTNODE_FETCH_EXTRA bfe,
    BOOL apply_ancestor_messages, // this BOOL is probably temporary, for #3972, once we know how range query estimates work, will revisit this
    BOOL may_modify_node,
    BRTNODE *node_p)
{
    void *node_v;
@@ -181,6 +184,7 @@ toku_pin_brtnode_holding_lock(
        toku_brtnode_fetch_callback,
        toku_brtnode_pf_req_callback,
        toku_brtnode_pf_callback,
        may_modify_node,
        bfe
        );
    assert(r==0);
@@ -196,6 +200,7 @@ toku_pin_brtnode_off_client_thread(
    BLOCKNUM blocknum,
    u_int32_t fullhash,
    BRTNODE_FETCH_EXTRA bfe,
    BOOL may_modify_node,
    u_int32_t num_dependent_nodes,
    BRTNODE* dependent_nodes,
    BRTNODE *node_p)
@@ -222,6 +227,7 @@ toku_pin_brtnode_off_client_thread(
        toku_brtnode_fetch_callback,
        toku_brtnode_pf_req_callback,
        toku_brtnode_pf_callback,
        may_modify_node,
        bfe,
        num_dependent_nodes,
        dependent_cf,
+3 −0
Original line number Diff line number Diff line
@@ -71,6 +71,7 @@ toku_pin_brtnode(
    ANCESTORS ancestors,
    const PIVOT_BOUNDS pbounds,
    BRTNODE_FETCH_EXTRA bfe,
    BOOL may_modify_node,
    BOOL apply_ancestor_messages, // this BOOL is probably temporary, for #3972, once we know how range query estimates work, will revisit this
    BRTNODE *node_p,
    BOOL* msgs_applied
@@ -88,6 +89,7 @@ toku_pin_brtnode_holding_lock(
    const PIVOT_BOUNDS pbounds,
    BRTNODE_FETCH_EXTRA bfe,
    BOOL apply_ancestor_messages,
    BOOL may_modify_node,
    BRTNODE *node_p
    );

@@ -104,6 +106,7 @@ toku_pin_brtnode_off_client_thread(
    BLOCKNUM blocknum,
    u_int32_t fullhash,
    BRTNODE_FETCH_EXTRA bfe,
    BOOL may_modify_node,
    u_int32_t num_dependent_nodes,
    BRTNODE* dependent_nodes,
    BRTNODE *node_p
+4 −8
Original line number Diff line number Diff line
@@ -400,7 +400,7 @@ ct_maybe_merge_child(struct flusher_advice *fa,
            CACHEKEY *rootp = toku_calculate_root_offset_pointer(h, &fullhash);
            struct brtnode_fetch_extra bfe;
            fill_bfe_for_full_read(&bfe, h);
            toku_pin_brtnode_off_client_thread(h, *rootp, fullhash, &bfe, 0,NULL, &root_node);
            toku_pin_brtnode_off_client_thread(h, *rootp, fullhash, &bfe, TRUE, 0, NULL, &root_node);
            toku_assert_entire_node_in_memory(root_node);

            toku_brtheader_release_treelock(h);
@@ -512,8 +512,6 @@ handle_split_of_child(
    BP_BLOCKNUM(node, childnum+1) = childb->thisnodename;
    BP_WORKDONE(node, childnum+1)  = 0;
    BP_STATE(node,childnum+1) = PT_AVAIL;
    BP_START(node,childnum+1) = 0;
    BP_SIZE(node,childnum+1) = 0;

    set_BNC(node, childnum+1, toku_create_empty_nl());

@@ -824,8 +822,6 @@ brtleaf_split(
            for (int i = 0; i < num_children_in_b; i++) {
                BP_BLOCKNUM(B,i).b = 0;
                BP_STATE(B,i) = PT_AVAIL;
                BP_START(B,i) = 0;
                BP_SIZE(B,i) = 0;
                BP_WORKDONE(B,i) = 0;
                set_BLB(B, i, toku_create_empty_bn());
            }
@@ -1361,7 +1357,7 @@ brt_merge_child(
        u_int32_t childfullhash = compute_child_fullhash(h->cf, node, childnuma);
        struct brtnode_fetch_extra bfe;
        fill_bfe_for_full_read(&bfe, h);
        toku_pin_brtnode_off_client_thread(h, BP_BLOCKNUM(node, childnuma), childfullhash, &bfe, 1, &node, &childa);
        toku_pin_brtnode_off_client_thread(h, BP_BLOCKNUM(node, childnuma), childfullhash, &bfe, TRUE, 1, &node, &childa);
    }
    // for test
    call_flusher_thread_callback(ft_flush_before_pin_second_node_for_merge);
@@ -1372,7 +1368,7 @@ brt_merge_child(
        u_int32_t childfullhash = compute_child_fullhash(h->cf, node, childnumb);
        struct brtnode_fetch_extra bfe;
        fill_bfe_for_full_read(&bfe, h);
        toku_pin_brtnode_off_client_thread(h, BP_BLOCKNUM(node, childnumb), childfullhash, &bfe, 2, dep_nodes, &childb);
        toku_pin_brtnode_off_client_thread(h, BP_BLOCKNUM(node, childnumb), childfullhash, &bfe, TRUE, 2, dep_nodes, &childb);
    }

    if (toku_bnc_n_entries(BNC(node,childnuma))>0) {
@@ -1498,7 +1494,7 @@ flush_some_child(
    // Note that we don't read the entire node into memory yet.
    // The idea is let's try to do the minimum work before releasing the parent lock
    fill_bfe_for_min_read(&bfe, h);
    toku_pin_brtnode_off_client_thread(h, targetchild, childfullhash, &bfe, 1, &parent, &child);
    toku_pin_brtnode_off_client_thread(h, targetchild, childfullhash, &bfe, TRUE, 1, &parent, &child);

    // for test
    call_flusher_thread_callback(ft_flush_after_child_pin);
+1 −0
Original line number Diff line number Diff line
@@ -280,6 +280,7 @@ toku_brt_hot_optimize(BRT brt,
                                               (BLOCKNUM) *rootp,
                                               fullhash,
                                               &bfe,
                                               TRUE, 
                                               0,
                                               NULL,
                                               &root);
+33 −19
Original line number Diff line number Diff line
@@ -188,6 +188,22 @@ typedef struct __attribute__((__packed__)) brtnode_child_pointer {
    } u;
} BRTNODE_CHILD_POINTER;


struct brtnode_disk_data {
    //
    // stores the offset to the beginning of the partition on disk from the brtnode, and the length, needed to read a partition off of disk
    // the value is only meaningful if the node is clean. If the node is dirty, then the value is meaningless
    //  The START is the distance from the end of the compressed node_info data, to the beginning of the compressed partition
    //  The SIZE is the size of the compressed partition.
    // Rationale:  We cannot store the size from the beginning of the node since we don't know how big the header will be.
    //  However, later when we are doing aligned writes, we won't be able to store the size from the end since we want things to align.
    u_int32_t start;
    u_int32_t size;
};
#define BP_START(node_dd,i) ((node_dd)[i].start)
#define BP_SIZE(node_dd,i) ((node_dd)[i].size)


// a brtnode partition, associated with a child of a node
struct   __attribute__((__packed__)) brtnode_partition {
    // the following three variables are used for nonleaf nodes
@@ -203,14 +219,6 @@ struct __attribute__((__packed__)) brtnode_partition {
    //
    enum pt_state state; // make this an enum to make debugging easier.  
    //
    // stores the offset to the beginning of the partition on disk from the brtnode, and the length, needed to read a partition off of disk
    // the value is only meaningful if the node is clean. If the node is dirty, then the value is meaningless
    //  The START is the distance from the end of the compressed node_info data, to the beginning of the compressed partition
    //  The SIZE is the size of the compressed partition.
    // Rationale:  We cannot store the size from the beginning of the node since we don't know how big the header will be.
    //  However, later when we are doing aligned writes, we won't be able to store the size from the end since we want things to align.
    u_int32_t start,size;
    //
    // pointer to the partition. Depending on the state, they may be different things
    // if state == PT_INVALID, then the node was just initialized and ptr == NULL
    // if state == PT_ON_DISK, then ptr == NULL
@@ -258,11 +266,7 @@ struct brtnode {
// brtnode partition macros
// BP stands for brtnode_partition
#define BP_BLOCKNUM(node,i) ((node)->bp[i].blocknum)
#define BP_HAVE_FULLHASH(node,i) ((node)->bp[i].have_fullhash)
#define BP_FULLHASH(node,i) ((node)->bp[i].fullhash)
#define BP_STATE(node,i) ((node)->bp[i].state)
#define BP_START(node,i) ((node)->bp[i].start)
#define BP_SIZE(node,i) ((node)->bp[i].size)
#define BP_WORKDONE(node, i)((node)->bp[i].workdone)

//
@@ -448,18 +452,21 @@ toku_create_compressed_partition_from_available(
    int childnum, 
    SUB_BLOCK sb
    );
void rebalance_brtnode_leaf(BRTNODE node, unsigned int basementnodesize);
int toku_serialize_brtnode_to_memory (BRTNODE node,
                                      BRTNODE_DISK_DATA* ndd,
                                      unsigned int basementnodesize,
                                      BOOL do_rebalancing,
                              /*out*/ size_t *n_bytes_to_write,
                              /*out*/ char  **bytes_to_write);
int toku_serialize_brtnode_to(int fd, BLOCKNUM, BRTNODE node, struct brt_header *h, int n_workitems, int n_threads, BOOL for_checkpoint);
int toku_serialize_brtnode_to(int fd, BLOCKNUM, BRTNODE node, BRTNODE_DISK_DATA* ndd, BOOL do_rebalancing, struct brt_header *h, int n_workitems, int n_threads, BOOL for_checkpoint);
int toku_serialize_rollback_log_to (int fd, BLOCKNUM blocknum, ROLLBACK_LOG_NODE log,
                                    struct brt_header *h, int n_workitems, int n_threads,
                                    BOOL for_checkpoint);
int toku_deserialize_rollback_log_from (int fd, BLOCKNUM blocknum, u_int32_t fullhash, ROLLBACK_LOG_NODE *logp, struct brt_header *h);
void toku_deserialize_bp_from_disk(BRTNODE node, int childnum, int fd, struct brtnode_fetch_extra* bfe);
void toku_deserialize_bp_from_disk(BRTNODE node, BRTNODE_DISK_DATA ndd, int childnum, int fd, struct brtnode_fetch_extra* bfe);
void toku_deserialize_bp_from_compressed(BRTNODE node, int childnum, DESCRIPTOR desc, brt_compare_func cmp);
int toku_deserialize_brtnode_from (int fd, BLOCKNUM off, u_int32_t /*fullhash*/, BRTNODE *brtnode, struct brtnode_fetch_extra* bfe);
int toku_deserialize_brtnode_from (int fd, BLOCKNUM off, u_int32_t /*fullhash*/, BRTNODE *brtnode, BRTNODE_DISK_DATA* ndd, struct brtnode_fetch_extra* bfe);
unsigned int toku_serialize_brtnode_size(BRTNODE node); /* How much space will it take? */
int toku_keycompare (bytevec key1, ITEMLEN key1len, bytevec key2, ITEMLEN key2len);

@@ -477,6 +484,8 @@ int toku_serialize_descriptor_contents_to_fd(int fd, const DESCRIPTOR desc, DISK
void toku_serialize_descriptor_contents_to_wbuf(struct wbuf *wb, const DESCRIPTOR desc);
BASEMENTNODE toku_create_empty_bn(void);
BASEMENTNODE toku_create_empty_bn_no_buffer(void); // create a basement node with a null buffer.
NONLEAF_CHILDINFO toku_clone_nl(NONLEAF_CHILDINFO orig_childinfo);
BASEMENTNODE toku_clone_bn(BASEMENTNODE orig_bn);
NONLEAF_CHILDINFO toku_create_empty_nl(void);
// FIXME needs toku prefix
void destroy_basement_node (BASEMENTNODE bn);
@@ -529,12 +538,13 @@ struct brtenv {
};

void toku_brt_status_update_pivot_fetch_reason(struct brtnode_fetch_extra *bfe);
extern void toku_brtnode_flush_callback (CACHEFILE cachefile, int fd, BLOCKNUM nodename, void *brtnode_v, void *extraargs, PAIR_ATTR size, PAIR_ATTR* new_size, BOOL write_me, BOOL keep_me, BOOL for_checkpoint);
extern int toku_brtnode_fetch_callback (CACHEFILE cachefile, int fd, BLOCKNUM nodename, u_int32_t fullhash, void **brtnode_pv, PAIR_ATTR *sizep, int*dirty, void*extraargs);
extern void toku_brtnode_pe_est_callback(void* brtnode_pv, long* bytes_freed_estimate, enum partial_eviction_cost *cost, void* write_extraargs);
extern void toku_brtnode_clone_callback(void* value_data, void** cloned_value_data, PAIR_ATTR* new_attr, BOOL for_checkpoint, void* write_extraargs);
extern void toku_brtnode_flush_callback (CACHEFILE cachefile, int fd, BLOCKNUM nodename, void *brtnode_v, void** UU(disk_data), void *extraargs, PAIR_ATTR size, PAIR_ATTR* new_size, BOOL write_me, BOOL keep_me, BOOL for_checkpoint, BOOL is_clone);
extern int toku_brtnode_fetch_callback (CACHEFILE cachefile, int fd, BLOCKNUM nodename, u_int32_t fullhash, void **brtnode_pv, void** UU(disk_data), PAIR_ATTR *sizep, int*dirty, void*extraargs);
extern void toku_brtnode_pe_est_callback(void* brtnode_pv, void* disk_data, long* bytes_freed_estimate, enum partial_eviction_cost *cost, void* write_extraargs);
extern int toku_brtnode_pe_callback (void *brtnode_pv, PAIR_ATTR old_attr, PAIR_ATTR* new_attr, void *extraargs);
extern BOOL toku_brtnode_pf_req_callback(void* brtnode_pv, void* read_extraargs);
int toku_brtnode_pf_callback(void* brtnode_pv, void* read_extraargs, int fd, PAIR_ATTR* sizep);
int toku_brtnode_pf_callback(void* brtnode_pv, void* UU(disk_data), void* read_extraargs, int fd, PAIR_ATTR* sizep);
extern int toku_brtnode_cleaner_callback( void *brtnode_pv, BLOCKNUM blocknum, u_int32_t fullhash, void *extraargs);
extern int toku_brt_alloc_init_header(BRT t, TOKUTXN txn);
extern int toku_read_brt_header_and_store_in_cachefile (BRT brt, CACHEFILE cf, LSN max_acceptable_lsn, struct brt_header **header, BOOL* was_open);
@@ -546,6 +556,7 @@ static inline CACHETABLE_WRITE_CALLBACK get_write_callbacks_for_node(struct brt_
    wc.pe_est_callback = toku_brtnode_pe_est_callback;
    wc.pe_callback = toku_brtnode_pe_callback;
    wc.cleaner_callback = toku_brtnode_cleaner_callback;
    wc.clone_callback = toku_brtnode_clone_callback;
    wc.write_extraargs = h;
    return wc;
}
@@ -900,6 +911,9 @@ typedef enum {
    BRT_STATUS_NUM_ROWS
} brt_status_entry;

void brt_begin_checkpoint(void);
void brt_end_checkpoint(void);

typedef struct {
    bool initialized;
    TOKU_ENGINE_STATUS_ROW_S status[BRT_STATUS_NUM_ROWS];
Loading