From cdd089a8c5b95ad131e950ce25ea23c06262f374 Mon Sep 17 00:00:00 2001 From: Jim Harris Date: Fri, 24 May 2019 10:31:17 -0700 Subject: [PATCH] blobfs: don't flush a partial buffer with no sync pending We flush a cache buffer once it's filled. When the write for that cache buffer has completed, we look to see if there's more data to flush. Currently if there's *any* more data to flush, we will flush it, even if it's not a full buffer. That can hurt performance though. Ideally we only want to flush a partial buffer if there's been an explicit sync operation that requires that partial buffer to be flushed. Otherwise we will end up writing the partial buffer to disk, and then come back and write that data again later when the buffer is full. Add a new unit test to test for this condition. This patch breaks one of the existing unit tests which was designed specifically around a RocksDB failure condition. Change that file_length unit test to now write exactly one CACHE_BUFFER, which still tests the general logic making sure that we don't confuse the amount of data flushed with the value written to the file's length xattr. Signed-off-by: Jim Harris Change-Id: I83795fb45afe854b38648d0e0c1a7928219307a2 Reviewed-on: https://review.gerrithub.io/c/spdk/spdk/+/455698 Tested-by: SPDK CI Jenkins Reviewed-by: Ben Walker Reviewed-by: Ziye Yang Reviewed-by: Changpeng Liu --- lib/blobfs/blobfs.c | 12 ++- .../blobfs/blobfs_sync_ut/blobfs_sync_ut.c | 76 +++++++++++++++++-- 2 files changed, 78 insertions(+), 10 deletions(-) diff --git a/lib/blobfs/blobfs.c b/lib/blobfs/blobfs.c index fa51e77c7..c83a800b4 100644 --- a/lib/blobfs/blobfs.c +++ b/lib/blobfs/blobfs.c @@ -2126,11 +2126,15 @@ __file_flush(void *ctx) pthread_spin_lock(&file->lock); next = spdk_tree_find_buffer(file->tree, file->length_flushed); - if (next == NULL || next->in_progress) { + if (next == NULL || next->in_progress || + ((next->bytes_filled < next->buf_size) && TAILQ_EMPTY(&file->sync_requests))) { /* - * There is either no data to flush, or a flush I/O is already in - * progress. So return immediately - if a flush I/O is in - * progress we will flush more data after that is completed. + * There is either no data to flush, a flush I/O is already in + * progress, or the next buffer is partially filled but there's no + * outstanding request to sync it. + * So return immediately - if a flush I/O is in progress we will flush + * more data after that is completed, or a partial buffer will get flushed + * when it is either filled or the file is synced. */ free_fs_request(req); if (next == NULL) { diff --git a/test/unit/lib/blobfs/blobfs_sync_ut/blobfs_sync_ut.c b/test/unit/lib/blobfs/blobfs_sync_ut/blobfs_sync_ut.c index f7ef8c59c..e70ea119a 100644 --- a/test/unit/lib/blobfs/blobfs_sync_ut/blobfs_sync_ut.c +++ b/test/unit/lib/blobfs/blobfs_sync_ut/blobfs_sync_ut.c @@ -171,6 +171,11 @@ _fs_unload(void *arg) g_fs = NULL; } +static void +_nop(void *arg) +{ +} + static void cache_read_after_write(void) { @@ -244,13 +249,10 @@ file_length(void) CU_ASSERT(rc == 0); SPDK_CU_ASSERT_FATAL(g_file != NULL); - /* Write slightly more than one CACHE_BUFFER. Filling at least one cache - * buffer triggers a flush to disk. Currently when that cache buffer is written, - * it will proceed to write the final byte, even though there's been no explicit - * sync for it yet. We will fix that eventually, but for now test with this - * behavior since it matches a subtle RocksDB failure scenario. + /* Write one CACHE_BUFFER. Filling at least one cache buffer triggers + * a flush to disk. */ - buf_length = CACHE_BUFFER_SIZE + 1; + buf_length = CACHE_BUFFER_SIZE; buf = calloc(1, buf_length); spdk_file_write(g_file, channel, buf, 0, buf_length); free(buf); @@ -301,6 +303,67 @@ file_length(void) ut_send_request(_fs_unload, NULL); } +static void +partial_buffer(void) +{ + int rc; + char *buf; + uint64_t buf_length; + struct spdk_fs_thread_ctx *channel; + struct spdk_file_stat stat = {0}; + + ut_send_request(_fs_init, NULL); + + channel = spdk_fs_alloc_thread_ctx(g_fs); + + g_file = NULL; + rc = spdk_fs_open_file(g_fs, channel, "testfile", SPDK_BLOBFS_OPEN_CREATE, &g_file); + CU_ASSERT(rc == 0); + SPDK_CU_ASSERT_FATAL(g_file != NULL); + + /* Write one CACHE_BUFFER plus one byte. Filling at least one cache buffer triggers + * a flush to disk. We want to make sure the extra byte is not implicitly flushed. + * It should only get flushed once we sync or close the file. + */ + buf_length = CACHE_BUFFER_SIZE + 1; + buf = calloc(1, buf_length); + spdk_file_write(g_file, channel, buf, 0, buf_length); + free(buf); + + /* Send some nop messages to the dispatch thread. This will ensure any of the + * pending write operations are completed. A well-functioning blobfs should only + * issue one write for the filled CACHE_BUFFER - a buggy one might try to write + * the extra byte. So do a bunch of _nops to make sure all of them (even the buggy + * ones) get a chance to run. Note that we can't just send a message to the + * dispatch thread to call spdk_thread_poll() because the messages are themselves + * run in the context of spdk_thread_poll(). + */ + ut_send_request(_nop, NULL); + ut_send_request(_nop, NULL); + ut_send_request(_nop, NULL); + ut_send_request(_nop, NULL); + ut_send_request(_nop, NULL); + ut_send_request(_nop, NULL); + + CU_ASSERT(g_file->length_flushed == CACHE_BUFFER_SIZE); + + /* Close the file. This causes an implicit sync which should write the + * length_flushed value as the "length" xattr on the file. + */ + spdk_file_close(g_file, channel); + + rc = spdk_fs_file_stat(g_fs, channel, "testfile", &stat); + CU_ASSERT(rc == 0); + CU_ASSERT(buf_length == stat.size); + + rc = spdk_fs_delete_file(g_fs, channel, "testfile"); + CU_ASSERT(rc == 0); + + spdk_fs_free_thread_ctx(channel); + + ut_send_request(_fs_unload, NULL); +} + static void cache_write_null_buffer(void) { @@ -518,6 +581,7 @@ int main(int argc, char **argv) if ( CU_add_test(suite, "cache read after write", cache_read_after_write) == NULL || CU_add_test(suite, "file length", file_length) == NULL || + CU_add_test(suite, "partial buffer", partial_buffer) == NULL || CU_add_test(suite, "write_null_buffer", cache_write_null_buffer) == NULL || CU_add_test(suite, "create_sync", fs_create_sync) == NULL || CU_add_test(suite, "rename_sync", fs_rename_sync) == NULL ||