Commit 94db78ce authored by heikki@donna.mysql.fi's avatar heikki@donna.mysql.fi
Browse files

srv0srv.h Support raw disk partitions as data files

srv0start.c	Support raw disk partitions as data files
srv0srv.c	Support raw disk partitions as data files
row0purge.c	< 4 GB rows, doublewrite, hang fixes
row0row.c	< 4 GB rows, doublewrite, hang fixes
row0sel.c	< 4 GB rows, doublewrite, hang fixes
row0uins.c	< 4 GB rows, doublewrite, hang fixes
row0umod.c	< 4 GB rows, doublewrite, hang fixes
row0undo.c	< 4 GB rows, doublewrite, hang fixes
row0upd.c	< 4 GB rows, doublewrite, hang fixes
srv0srv.c	< 4 GB rows, doublewrite, hang fixes
srv0start.c	< 4 GB rows, doublewrite, hang fixes
sync0rw.c	< 4 GB rows, doublewrite, hang fixes
sync0sync.c	< 4 GB rows, doublewrite, hang fixes
trx0purge.c	< 4 GB rows, doublewrite, hang fixes
trx0rec.c	< 4 GB rows, doublewrite, hang fixes
trx0sys.c	< 4 GB rows, doublewrite, hang fixes
btr0btr.c	< 4 GB rows, doublewrite, hang fixes
btr0cur.c	< 4 GB rows, doublewrite, hang fixes
buf0buf.c	< 4 GB rows, doublewrite, hang fixes
buf0flu.c	< 4 GB rows, doublewrite, hang fixes
buf0rea.c	< 4 GB rows, doublewrite, hang fixes
data0data.c	< 4 GB rows, doublewrite, hang fixes
fil0fil.c	< 4 GB rows, doublewrite, hang fixes
fsp0fsp.c	< 4 GB rows, doublewrite, hang fixes
ibuf0ibuf.c	< 4 GB rows, doublewrite, hang fixes
lock0lock.c	< 4 GB rows, doublewrite, hang fixes
log0log.c	< 4 GB rows, doublewrite, hang fixes
log0recv.c	< 4 GB rows, doublewrite, hang fixes
os0file.c	< 4 GB rows, doublewrite, hang fixes
page0cur.c	< 4 GB rows, doublewrite, hang fixes
pars0pars.c	< 4 GB rows, doublewrite, hang fixes
rem0cmp.c	< 4 GB rows, doublewrite, hang fixes
rem0rec.c	< 4 GB rows, doublewrite, hang fixes
row0ins.c	< 4 GB rows, doublewrite, hang fixes
row0mysql.c	< 4 GB rows, doublewrite, hang fixes
univ.i  	< 4 GB rows, doublewrite, hang fixes
data0data.ic	< 4 GB rows, doublewrite, hang fixes
mach0data.ic	< 4 GB rows, doublewrite, hang fixes
rem0rec.ic	< 4 GB rows, doublewrite, hang fixes
row0upd.ic	< 4 GB rows, doublewrite, hang fixes
trx0rec.ic	< 4 GB rows, doublewrite, hang fixes
rem0cmp.h	< 4 GB rows, doublewrite, hang fixes
rem0rec.h	< 4 GB rows, doublewrite, hang fixes
row0ins.h	< 4 GB rows, doublewrite, hang fixes
row0mysql.h	< 4 GB rows, doublewrite, hang fixes
row0row.h	< 4 GB rows, doublewrite, hang fixes
row0upd.h	< 4 GB rows, doublewrite, hang fixes
srv0srv.h	< 4 GB rows, doublewrite, hang fixes
sync0sync.h	< 4 GB rows, doublewrite, hang fixes
trx0rec.h	< 4 GB rows, doublewrite, hang fixes
trx0sys.h	< 4 GB rows, doublewrite, hang fixes
trx0types.h	< 4 GB rows, doublewrite, hang fixes
trx0undo.h	< 4 GB rows, doublewrite, hang fixes
ut0dbg.h	< 4 GB rows, doublewrite, hang fixes
ut0ut.h 	< 4 GB rows, doublewrite, hang fixes
btr0btr.h	< 4 GB rows, doublewrite, hang fixes
btr0cur.h	< 4 GB rows, doublewrite, hang fixes
buf0buf.h	< 4 GB rows, doublewrite, hang fixes
buf0flu.h	< 4 GB rows, doublewrite, hang fixes
data0data.h	< 4 GB rows, doublewrite, hang fixes
dict0mem.h	< 4 GB rows, doublewrite, hang fixes
fil0fil.h	< 4 GB rows, doublewrite, hang fixes
fsp0fsp.h	< 4 GB rows, doublewrite, hang fixes
os0file.h	< 4 GB rows, doublewrite, hang fixes
parent 596d69b5
Loading
Loading
Loading
Loading
+36 −35
Original line number Diff line number Diff line
@@ -71,30 +71,6 @@ btr_page_create(
	dict_tree_t*	tree,	/* in: index tree */
	mtr_t*		mtr);	/* in: mtr */
/******************************************************************
Allocates a new file page to be used in an index tree. */
static
page_t*
btr_page_alloc(
/*===========*/
					/* out: new allocated page,
					x-latched */
	dict_tree_t*	tree,		/* in: index tree */
	ulint		hint_page_no,	/* in: hint of a good page */
	byte		file_direction,	/* in: direction where a possible
					page split is made */
	ulint		level,		/* in: level where the page is placed
					in the tree */
	mtr_t*		mtr);		/* in: mtr */
/******************************************************************
Frees a file page used in an index tree. */
static
void
btr_page_free(
/*==========*/
	dict_tree_t*	tree,	/* in: index tree */
	page_t*		page,	/* in, own: page to be freed */	
	mtr_t*		mtr);	/* in: mtr */
/******************************************************************
Sets the child node file address in a node pointer. */
UNIV_INLINE
void
@@ -319,11 +295,12 @@ btr_page_alloc_for_ibuf(
/******************************************************************
Allocates a new file page to be used in an index tree. NOTE: we assume
that the caller has made the reservation for free extents! */
static

page_t*
btr_page_alloc(
/*===========*/
					/* out: new allocated page, x-latched */
					/* out: new allocated page, x-latched;
					NULL if out of space */
	dict_tree_t*	tree,		/* in: index tree */
	ulint		hint_page_no,	/* in: hint of a good page */
	byte		file_direction,	/* in: direction where a possible
@@ -358,7 +335,10 @@ btr_page_alloc(
	
	new_page_no = fseg_alloc_free_page_general(seg_header, hint_page_no,
						file_direction, TRUE, mtr);
	ut_a(new_page_no != FIL_NULL);
	if (new_page_no == FIL_NULL) {

		return(NULL);
	}

	new_page = buf_page_get(dict_tree_get_space(tree), new_page_no,
							RW_X_LATCH, mtr);
@@ -435,20 +415,22 @@ btr_page_free_for_ibuf(
}

/******************************************************************
Frees a file page used in an index tree. */
static
Frees a file page used in an index tree. Can also be used to free (BLOB)
external storage pages, because the page level 0 can be given as an
argument. */

void
btr_page_free(
/*==========*/
btr_page_free_low(
/*==============*/
	dict_tree_t*	tree,	/* in: index tree */
	page_t*		page,	/* in: page to be freed, x-latched */	
	ulint		level,	/* in: page level */
	mtr_t*		mtr)	/* in: mtr */
{
	fseg_header_t*	seg_header;
	page_t*		root;
	ulint		space;
	ulint		page_no;
	ulint		level;

	ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
			      				MTR_MEMO_PAGE_X_FIX));
@@ -466,8 +448,6 @@ btr_page_free(

	root = btr_root_get(tree, mtr);
	
	level = btr_page_get_level(page, mtr);
	
	if (level == 0) {
		seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
	} else {
@@ -480,6 +460,26 @@ btr_page_free(
	fseg_free_page(seg_header, space, page_no, mtr);
}	

/******************************************************************
Releases a file page that belongs to an index tree. The level handed on
to btr_page_free_low is read from the page itself, so this routine cannot
be used for field external storage pages: such a page must contain info
on its level for this to work. */

void
btr_page_free(
/*==========*/
	dict_tree_t*	tree,	/* in: index tree */
	page_t*		page,	/* in: page to be freed, x-latched */
	mtr_t*		mtr)	/* in: mtr */
{
	/* Debug check: the mtr must hold an x-fix on the page */
	ut_ad(mtr_memo_contains(mtr, buf_block_align(page),
						MTR_MEMO_PAGE_X_FIX));

	/* Look up the level stored on the page and let the low-level
	routine do the actual freeing */
	btr_page_free_low(tree, page, btr_page_get_level(page, mtr), mtr);
}

/******************************************************************
Sets the child node file address in a node pointer. */
UNIV_INLINE
@@ -1276,6 +1276,7 @@ btr_insert_on_non_leaf_level(
	dtuple_t*	tuple,	/* in: the record to be inserted */
	mtr_t*		mtr)	/* in: mtr */
{
	big_rec_t*	dummy_big_rec;
	btr_cur_t	cursor;		
	ulint		err;
	rec_t*		rec;
@@ -1294,7 +1295,7 @@ btr_insert_on_non_leaf_level(
					| BTR_KEEP_SYS_FLAG
					| BTR_NO_UNDO_LOG_FLAG,
					&cursor, tuple,
					&rec, NULL, mtr);
					&rec, &dummy_big_rec, NULL, mtr);
	ut_a(err == DB_SUCCESS);
}

+790 −30

File changed.

Preview size limit exceeded, changes collapsed.

+58 −23
Original line number Diff line number Diff line
@@ -219,13 +219,43 @@ buf_calc_page_checksum(
  	ulint checksum;

  	checksum = ut_fold_binary(page, FIL_PAGE_FILE_FLUSH_LSN);
  + ut_fold_binary(page + FIL_PAGE_DATA, UNIV_PAGE_SIZE - FIL_PAGE_DATA
  		+ ut_fold_binary(page + FIL_PAGE_DATA,
				UNIV_PAGE_SIZE - FIL_PAGE_DATA
				- FIL_PAGE_END_LSN);
  	checksum = checksum & 0xFFFFFFFF;

  	return(checksum);
}

/************************************************************************
Decides whether a page image looks corrupted. A page is accepted when
the 4 bytes at FIL_PAGE_LSN + 4 in the header equal the second 4 bytes of
the end lsn field, and the first 4 bytes of the end lsn field hold either
the page checksum or the same value as the 4 bytes at FIL_PAGE_LSN. */

ibool
buf_page_is_corrupted(
/*==================*/
				/* out: TRUE if corrupted */
	byte*	read_buf)	/* in: a database page */
{
	byte*	head_lsn;
	byte*	end_lsn;
	ulint	calculated;
	ulint	stored;

	head_lsn = read_buf + FIL_PAGE_LSN;
	end_lsn = read_buf + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN;

	calculated = buf_calc_page_checksum(read_buf);

	/* The second halves of the header lsn and of the end lsn field
	must always agree */

	if (mach_read_from_4(head_lsn + 4)
				!= mach_read_from_4(end_lsn + 4)) {
		return(TRUE);
	}

	stored = mach_read_from_4(end_lsn);

	if (stored == calculated) {
		/* The first half of the end lsn field carries a valid
		page checksum */

		return(FALSE);
	}

	if (stored == mach_read_from_4(head_lsn)) {
		/* No checksum match, but the field still equals the first
		half of the header lsn: accept the page */

		return(FALSE);
	}

	return(TRUE);
}

/************************************************************************
Initializes a buffer control block when the buf_pool is created. */
static
@@ -1265,28 +1295,16 @@ buf_page_io_complete(
	dulint		id;
	dict_index_t*	index;
	ulint		io_type;
	ulint           checksum;

	ut_ad(block);

	io_type = block->io_fix;

	if (io_type == BUF_IO_READ) {
		checksum = buf_calc_page_checksum(block->frame);

		/* From version 3.23.38 up we store the page checksum
		   to the 4 upper bytes of the page end lsn field */

		if ((mach_read_from_4(block->frame + FIL_PAGE_LSN + 4)
		    != mach_read_from_4(block->frame + UNIV_PAGE_SIZE
					- FIL_PAGE_END_LSN + 4))
		    || (checksum != mach_read_from_4(block->frame
                                        + UNIV_PAGE_SIZE
					- FIL_PAGE_END_LSN)
			&& mach_read_from_4(block->frame + FIL_PAGE_LSN)
			    != mach_read_from_4(block->frame
                                        + UNIV_PAGE_SIZE
						- FIL_PAGE_END_LSN))) {
		if (buf_page_is_corrupted(block->frame)) {
		  	fprintf(stderr,
			  "InnoDB: Database page corruption or a failed\n"
			  "InnoDB: file read of page %lu.\n", block->offset);
@@ -1601,11 +1619,28 @@ void
buf_print_io(void)
/*==============*/
{
	ulint	size;
	
	ut_ad(buf_pool);

	size = buf_pool_get_curr_size() / UNIV_PAGE_SIZE;

	mutex_enter(&(buf_pool->mutex));
	
	printf("pages read %lu, created %lu, written %lu\n",
	printf("LRU list length %lu \n", UT_LIST_GET_LEN(buf_pool->LRU));
	printf("Free list length %lu \n", UT_LIST_GET_LEN(buf_pool->free));
	printf("Flush list length %lu \n",
				UT_LIST_GET_LEN(buf_pool->flush_list));
	printf("Buffer pool size in pages %lu\n", size);

	printf("Pending reads %lu \n", buf_pool->n_pend_reads);

	printf("Pending writes: LRU %lu, flush list %lu, single page %lu\n",
		buf_pool->n_flush[BUF_FLUSH_LRU],
		buf_pool->n_flush[BUF_FLUSH_LIST],
		buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]);

	printf("Pages read %lu, created %lu, written %lu\n",
			buf_pool->n_pages_read, buf_pool->n_pages_created,
						buf_pool->n_pages_written);
	mutex_exit(&(buf_pool->mutex));
+171 −24
Original line number Diff line number Diff line
/******************************************************
The database buffer buf_pool flush algorithm

(c) 1995 Innobase Oy
(c) 1995-2001 Innobase Oy

Created 11/11/1995 Heikki Tuuri
*******************************************************/
@@ -15,7 +15,6 @@ Created 11/11/1995 Heikki Tuuri
#include "ut0byte.h"
#include "ut0lst.h"
#include "fil0fil.h"

#include "buf0buf.h"
#include "buf0lru.h"
#include "buf0rea.h"
@@ -195,9 +194,145 @@ buf_flush_write_complete(
}

/************************************************************************
Does an asynchronous write of a buffer page. NOTE: in simulated aio we must
call os_aio_simulated_wake_handler_threads after we have posted a batch
of writes! */
Flushes possible buffered writes from the doublewrite memory buffer to disk,
and also wakes up the aio thread if simulated aio is used. It is very
important to call this function after a batch of writes has been posted,
and also when we may have to wait for a page latch! Otherwise a deadlock
of threads can occur. */
static
void
buf_flush_buffered_writes(void)
/*===========================*/
{
	buf_block_t*	block;
	ulint		n_pages;
	ulint		n_bytes;
	ulint		i;

	if (trx_doublewrite == NULL) {
		/* No doublewrite buffer exists: all there is to do is to
		wake the simulated aio handler threads */

		os_aio_simulated_wake_handler_threads();

		return;
	}

	mutex_enter(&(trx_doublewrite->mutex));

	/* The mutex is held until the end of the function, so the number
	of buffered pages can be read once here */

	n_pages = trx_doublewrite->first_free;

	if (n_pages == 0) {
		/* Nothing has been posted to the memory buffer */

		mutex_exit(&(trx_doublewrite->mutex));

		return;
	}

	/* Step 1: copy the buffered pages to the doublewrite blocks in
	the system tablespace. The writes are synchronous, so each one
	has completed when fil_io returns. The first block takes at most
	TRX_SYS_DOUBLEWRITE_BLOCK_SIZE pages. */

	if (n_pages <= TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
		n_bytes = n_pages * UNIV_PAGE_SIZE;
	} else {
		n_bytes = TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE;
	}

	fil_io(OS_FILE_WRITE,
		TRUE, TRX_SYS_SPACE,
		trx_doublewrite->block1, 0, n_bytes,
			(void*)trx_doublewrite->write_buf, NULL);

	if (n_pages > TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
		/* The remaining pages go to the second block */

		n_bytes = (n_pages - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
							* UNIV_PAGE_SIZE;

		fil_io(OS_FILE_WRITE,
			TRUE, TRX_SYS_SPACE,
			trx_doublewrite->block2, 0, n_bytes,
			(void*)(trx_doublewrite->write_buf
			+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE),
			NULL);
	}

	/* Step 2: flush the doublewrite blocks to disk, so that recovery
	will find the pages there */

	fil_flush(TRX_SYS_SPACE);

	/* Step 3: post the writes of the pages to their real positions.
	The simulated aio handlers are woken only after the whole batch
	has been posted. */

	for (i = 0; i < n_pages; i++) {
		block = trx_doublewrite->buf_block_arr[i];

		fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
			FALSE, block->space, block->offset, 0, UNIV_PAGE_SIZE,
					(void*)block->frame, (void*)block);
	}

	os_aio_simulated_wake_handler_threads();

	/* Step 4: wait until all the async writes to the tablespaces have
	been posted to the OS, and then flush the data to disk (for
	example, with fsync) */

	os_aio_wait_until_no_pending_writes();

	fil_flush_file_spaces(FIL_TABLESPACE);

	/* The doublewrite memory buffer is free for reuse */

	trx_doublewrite->first_free = 0;

	mutex_exit(&(trx_doublewrite->mutex));
}

/************************************************************************
Adds a buffer page to the doublewrite memory buffer for writing. If the
memory buffer is full on entry, buf_flush_buffered_writes is called until
free space appears. If adding this page fills the memory buffer,
buf_flush_buffered_writes is called before returning. */
static
void
buf_flush_post_to_doublewrite_buf(
/*==============================*/
	buf_block_t*	block)	/* in: buffer block to write */
{
	/* Acquire the doublewrite mutex in a state where there is at
	least one free slot in the memory buffer */

	for (;;) {
		mutex_enter(&(trx_doublewrite->mutex));

		if (trx_doublewrite->first_free
				< 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
			break;
		}

		/* Buffer is full: release the mutex, flush the buffered
		writes and then check again */

		mutex_exit(&(trx_doublewrite->mutex));

		buf_flush_buffered_writes();
	}

	/* Record the block in the first free slot and copy its page
	frame to the corresponding position in the write buffer */

	trx_doublewrite->buf_block_arr[trx_doublewrite->first_free] = block;

	ut_memcpy(trx_doublewrite->write_buf
				+ UNIV_PAGE_SIZE * trx_doublewrite->first_free,
			block->frame, UNIV_PAGE_SIZE);

	trx_doublewrite->first_free++;

	if (trx_doublewrite->first_free
				< 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
		/* Still room left for further pages */

		mutex_exit(&(trx_doublewrite->mutex));

		return;
	}

	/* This page filled the buffer: flush it right away. The mutex
	must be released first because buf_flush_buffered_writes
	acquires it itself. */

	mutex_exit(&(trx_doublewrite->mutex));

	buf_flush_buffered_writes();
}

/************************************************************************
Does an asynchronous write of a buffer page. NOTE: in simulated aio and
also when the doublewrite buffer is used, we must call
buf_flush_buffered_writes after we have posted a batch of writes! */
static
void
buf_flush_write_block_low(
@@ -222,15 +357,24 @@ buf_flush_write_block_low(
	mach_write_to_8(block->frame + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN,
						block->newest_modification);

	/* Write to the page the space id and page number */

	mach_write_to_4(block->frame + FIL_PAGE_SPACE, block->space);
	mach_write_to_4(block->frame + FIL_PAGE_OFFSET, block->offset);

	/* We overwrite the first 4 bytes of the end lsn field to store
	a page checksum */

	mach_write_to_4(block->frame + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN,
			buf_calc_page_checksum(block->frame));

	if (!trx_doublewrite) {
		fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
			FALSE, block->space, block->offset, 0, UNIV_PAGE_SIZE,
		 			(void*)block->frame, (void*)block);
	} else {
		buf_flush_post_to_doublewrite_buf(block);
	}
}

/************************************************************************
@@ -251,14 +395,14 @@ buf_flush_try_page(
	buf_block_t*	block;
	ibool		locked;
	
	ut_ad((flush_type == BUF_FLUSH_LRU) || (flush_type == BUF_FLUSH_LIST)
				|| (flush_type == BUF_FLUSH_SINGLE_PAGE));
	ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST
				|| flush_type == BUF_FLUSH_SINGLE_PAGE);

	mutex_enter(&(buf_pool->mutex));

	block = buf_page_hash_get(space, offset);

	if ((flush_type == BUF_FLUSH_LIST)
	if (flush_type == BUF_FLUSH_LIST
	    && block && buf_flush_ready_for_flush(block, flush_type)) {
	
		block->io_fix = BUF_IO_WRITE;
@@ -286,7 +430,7 @@ buf_flush_try_page(
		mutex_exit(&(buf_pool->mutex));

		if (!locked) {
			os_aio_simulated_wake_handler_threads();
			buf_flush_buffered_writes();

			rw_lock_s_lock_gen(&(block->lock), BUF_IO_WRITE);
		}
@@ -300,7 +444,7 @@ buf_flush_try_page(
		
		return(1);

	} else if ((flush_type == BUF_FLUSH_LRU) && block
	} else if (flush_type == BUF_FLUSH_LRU && block
			&& buf_flush_ready_for_flush(block, flush_type)) {

		/* VERY IMPORTANT:
@@ -328,7 +472,7 @@ buf_flush_try_page(

		return(1);

	} else if ((flush_type == BUF_FLUSH_SINGLE_PAGE) && block
	} else if (flush_type == BUF_FLUSH_SINGLE_PAGE && block
			&& buf_flush_ready_for_flush(block, flush_type)) {
	
		block->io_fix = BUF_IO_WRITE;
@@ -385,6 +529,14 @@ buf_flush_try_neighbors(
		/* If there is little space, it is better not to flush any
		block except from the end of the LRU list */
	
		low = offset;
		high = offset + 1;
	} else if (flush_type == BUF_FLUSH_LIST) {
		/* Since semaphore waits require us to flush the
		doublewrite buffer to disk, it is best that the
		search area is just the page itself, to minimize
		chances for semaphore waits */

		low = offset;
		high = offset + 1;
	}		
@@ -418,13 +570,6 @@ buf_flush_try_neighbors(
				
	mutex_exit(&(buf_pool->mutex));

	/* In simulated aio we wake up the i/o-handler threads now that
	we have posted a batch of writes: */
	
	/*	printf("Flush count %lu ; Waking i/o handlers\n", count); */

	os_aio_simulated_wake_handler_threads();

	return(count);
}

@@ -565,13 +710,15 @@ buf_flush_batch(

	mutex_exit(&(buf_pool->mutex));

	if (buf_debug_prints && (page_count > 0)) {
	buf_flush_buffered_writes();

	if (buf_debug_prints && page_count > 0) {
		if (flush_type == BUF_FLUSH_LRU) {
			printf("To flush %lu pages in LRU flush\n",
			printf("Flushed %lu pages in LRU flush\n",
						page_count);
		} else if (flush_type == BUF_FLUSH_LIST) {
			printf("To flush %lu pages in flush list flush\n",
						page_count, flush_type);
			printf("Flushed %lu pages in flush list flush\n",
						page_count);
		} else {
			ut_error;
		}
+13 −1
Original line number Diff line number Diff line
@@ -49,7 +49,9 @@ ulint
buf_read_page_low(
/*==============*/
			/* out: 1 if a read request was queued, 0 if the page
			already resided in buf_pool */
			already resided in buf_pool or if the page is in
			the doublewrite buffer blocks in which case it is never
			read into the pool */
	ibool	sync,	/* in: TRUE if synchronous aio is desired */
	ulint	mode,	/* in: BUF_READ_IBUF_PAGES_ONLY, ...,
			ORed to OS_AIO_SIMULATED_WAKE_LATER (see below
@@ -63,6 +65,16 @@ buf_read_page_low(
	wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
	mode = mode & ~OS_AIO_SIMULATED_WAKE_LATER;
	
	if (trx_doublewrite && space == TRX_SYS_SPACE
		&& (   (offset >= trx_doublewrite->block1
		        && offset < trx_doublewrite->block1
		     		+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
		    || (offset >= trx_doublewrite->block2
		        && offset < trx_doublewrite->block2
		     		+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE))) {
		return(0);
	}

#ifdef UNIV_LOG_DEBUG
	if (space % 2 == 1) {
		/* We are updating a replicate space while holding the
Loading