From 607e806e9e31c4069d08e568ed0898e142cfb47b Mon Sep 17 00:00:00 2001 From: Max Reitz Date: Fri, 18 Sep 2015 14:19:09 +0200 Subject: [PATCH 02/10] qemu-img convert: Rewrite copying logic Message-id: <1442585950-6502-3-git-send-email-mreitz@redhat.com> Patchwork-id: 67832 O-Subject: [RHEV-7.2 qemu-kvm-rhev PATCH 2/3] qemu-img convert: Rewrite copying logic Bugzilla: 1256541 RH-Acked-by: Miroslav Rezanina RH-Acked-by: Paolo Bonzini RH-Acked-by: Kevin Wolf From: Kevin Wolf The implementation of qemu-img convert is (a) messy, (b) buggy, and (c) less efficient than possible. The changes required to beat some sense into it are massive enough that incremental changes would only make my and the reviewers' life harder. So throw it away and reimplement it from scratch. Let me give some examples what I mean by messy, buggy and inefficient: (a) The copying logic of qemu-img convert has two separate branches for compressed and normal target images, which roughly do the same - except for a little code that handles actual differences between compressed and uncompressed images, and much more code that implements just a different set of optimisations and bugs. This is unnecessary code duplication, and makes the code for compressed output (unsurprisingly) suffer from bitrot. The code for uncompressed ouput is run twice to count the the total length for the progress bar. In the first run it just takes a shortcut and runs only half the loop, and when it's done, it toggles a boolean, jumps out of the loop with a backwards goto and starts over. Works, but pretty is something different. (b) Converting while keeping a backing file (-B option) is broken in several ways. This includes not writing to the image file if the input has zero clusters or data filled with zeros (ignoring that the backing file will be visible instead). It also doesn't correctly limit every iteration of the copy loop to sectors of the same status so that too many sectors may be copied to in the target image. For -B this gives an unexpected result, for other images it just does more work than necessary. Conversion with a compressed target completely ignores any target backing file. (c) qemu-img convert skips reading and writing an area if it knows from metadata that copying isn't needed (except for the bug mentioned above that ignores a status change in some cases). It does, however, read from the source even if it knows that it will read zeros, and then search for non-zero bytes in the read buffer, if it's possible that a write might be needed. This reimplementation of the copying core reorganises the code to remove the duplication and have a much more obvious code flow, by essentially splitting the copy iteration loop into three parts: 1. Find the number of contiguous sectors of the same status at the current offset (This can also be called in a separate loop before the copying loop in order to determine the total sectors for the progress bar.) 2. Read sectors. If the status implies that there is no data there to read (zero or unallocated cluster), don't do anything. 3. Write sectors depending on the status. If it's data, write it. If we want the backing file to be visible (with -B), don't write it. If it's zeroed, skip it if you can, otherwise use bdrv_write_zeroes() to optimise the write at least where possible. Signed-off-by: Kevin Wolf Reviewed-by: Max Reitz (cherry picked from commit 690c7301600162421b928c7f26fd488fd8fa464e) Signed-off-by: Max Reitz Signed-off-by: Miroslav Rezanina --- qemu-img.c | 516 +++++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 310 insertions(+), 206 deletions(-) diff --git a/qemu-img.c b/qemu-img.c index ee38e0e..86ce1d0 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -1305,20 +1305,312 @@ out3: return ret; } +enum ImgConvertBlockStatus { + BLK_DATA, + BLK_ZERO, + BLK_BACKING_FILE, +}; + +typedef struct ImgConvertState { + BlockBackend **src; + int64_t *src_sectors; + int src_cur, src_num; + int64_t src_cur_offset; + int64_t total_sectors; + int64_t allocated_sectors; + enum ImgConvertBlockStatus status; + int64_t sector_next_status; + BlockBackend *target; + bool has_zero_init; + bool compressed; + bool target_has_backing; + int min_sparse; + size_t cluster_sectors; + size_t buf_sectors; +} ImgConvertState; + +static void convert_select_part(ImgConvertState *s, int64_t sector_num) +{ + assert(sector_num >= s->src_cur_offset); + while (sector_num - s->src_cur_offset >= s->src_sectors[s->src_cur]) { + s->src_cur_offset += s->src_sectors[s->src_cur]; + s->src_cur++; + assert(s->src_cur < s->src_num); + } +} + +static int convert_iteration_sectors(ImgConvertState *s, int64_t sector_num) +{ + int64_t ret; + int n; + + convert_select_part(s, sector_num); + + assert(s->total_sectors > sector_num); + n = MIN(s->total_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS); + + if (s->sector_next_status <= sector_num) { + ret = bdrv_get_block_status(blk_bs(s->src[s->src_cur]), + sector_num - s->src_cur_offset, + n, &n); + if (ret < 0) { + return ret; + } + + if (ret & BDRV_BLOCK_ZERO) { + s->status = BLK_ZERO; + } else if (ret & BDRV_BLOCK_DATA) { + s->status = BLK_DATA; + } else if (!s->target_has_backing) { + /* Without a target backing file we must copy over the contents of + * the backing file as well. */ + /* TODO Check block status of the backing file chain to avoid + * needlessly reading zeroes and limiting the iteration to the + * buffer size */ + s->status = BLK_DATA; + } else { + s->status = BLK_BACKING_FILE; + } + + s->sector_next_status = sector_num + n; + } + + n = MIN(n, s->sector_next_status - sector_num); + if (s->status == BLK_DATA) { + n = MIN(n, s->buf_sectors); + } + + /* We need to write complete clusters for compressed images, so if an + * unallocated area is shorter than that, we must consider the whole + * cluster allocated. */ + if (s->compressed) { + if (n < s->cluster_sectors) { + n = MIN(s->cluster_sectors, s->total_sectors - sector_num); + s->status = BLK_DATA; + } else { + n = QEMU_ALIGN_DOWN(n, s->cluster_sectors); + } + } + + return n; +} + +static int convert_read(ImgConvertState *s, int64_t sector_num, int nb_sectors, + uint8_t *buf) +{ + int n; + int ret; + + if (s->status == BLK_ZERO || s->status == BLK_BACKING_FILE) { + return 0; + } + + assert(nb_sectors <= s->buf_sectors); + while (nb_sectors > 0) { + BlockBackend *blk; + int64_t bs_sectors; + + /* In the case of compression with multiple source files, we can get a + * nb_sectors that spreads into the next part. So we must be able to + * read across multiple BDSes for one convert_read() call. */ + convert_select_part(s, sector_num); + blk = s->src[s->src_cur]; + bs_sectors = s->src_sectors[s->src_cur]; + + n = MIN(nb_sectors, bs_sectors - (sector_num - s->src_cur_offset)); + ret = blk_read(blk, sector_num - s->src_cur_offset, buf, n); + if (ret < 0) { + return ret; + } + + sector_num += n; + nb_sectors -= n; + buf += n * BDRV_SECTOR_SIZE; + } + + return 0; +} + +static int convert_write(ImgConvertState *s, int64_t sector_num, int nb_sectors, + const uint8_t *buf) +{ + int ret; + + while (nb_sectors > 0) { + int n = nb_sectors; + + switch (s->status) { + case BLK_BACKING_FILE: + /* If we have a backing file, leave clusters unallocated that are + * unallocated in the source image, so that the backing file is + * visible at the respective offset. */ + assert(s->target_has_backing); + break; + + case BLK_DATA: + /* We must always write compressed clusters as a whole, so don't + * try to find zeroed parts in the buffer. We can only save the + * write if the buffer is completely zeroed and we're allowed to + * keep the target sparse. */ + if (s->compressed) { + if (s->has_zero_init && s->min_sparse && + buffer_is_zero(buf, n * BDRV_SECTOR_SIZE)) + { + assert(!s->target_has_backing); + break; + } + + ret = blk_write_compressed(s->target, sector_num, buf, n); + if (ret < 0) { + return ret; + } + break; + } + + /* If there is real non-zero data or we're told to keep the target + * fully allocated (-S 0), we must write it. Otherwise we can treat + * it as zero sectors. */ + if (!s->min_sparse || + is_allocated_sectors_min(buf, n, &n, s->min_sparse)) + { + ret = blk_write(s->target, sector_num, buf, n); + if (ret < 0) { + return ret; + } + break; + } + /* fall-through */ + + case BLK_ZERO: + if (s->has_zero_init) { + break; + } + ret = blk_write_zeroes(s->target, sector_num, n, 0); + if (ret < 0) { + return ret; + } + break; + } + + sector_num += n; + nb_sectors -= n; + buf += n * BDRV_SECTOR_SIZE; + } + + return 0; +} + +static int convert_do_copy(ImgConvertState *s) +{ + uint8_t *buf = NULL; + int64_t sector_num, allocated_done; + int ret; + int n; + + /* Check whether we have zero initialisation or can get it efficiently */ + s->has_zero_init = s->min_sparse && !s->target_has_backing + ? bdrv_has_zero_init(blk_bs(s->target)) + : false; + + if (!s->has_zero_init && !s->target_has_backing && + bdrv_can_write_zeroes_with_unmap(blk_bs(s->target))) + { + ret = bdrv_make_zero(blk_bs(s->target), BDRV_REQ_MAY_UNMAP); + if (ret == 0) { + s->has_zero_init = true; + } + } + + /* Allocate buffer for copied data. For compressed images, only one cluster + * can be copied at a time. */ + if (s->compressed) { + if (s->cluster_sectors <= 0 || s->cluster_sectors > s->buf_sectors) { + error_report("invalid cluster size"); + ret = -EINVAL; + goto fail; + } + s->buf_sectors = s->cluster_sectors; + } + buf = blk_blockalign(s->target, s->buf_sectors * BDRV_SECTOR_SIZE); + + /* Calculate allocated sectors for progress */ + s->allocated_sectors = 0; + sector_num = 0; + while (sector_num < s->total_sectors) { + n = convert_iteration_sectors(s, sector_num); + if (n < 0) { + ret = n; + goto fail; + } + if (s->status == BLK_DATA) { + s->allocated_sectors += n; + } + sector_num += n; + } + + /* Do the copy */ + s->src_cur = 0; + s->src_cur_offset = 0; + s->sector_next_status = 0; + + sector_num = 0; + allocated_done = 0; + + while (sector_num < s->total_sectors) { + n = convert_iteration_sectors(s, sector_num); + if (n < 0) { + ret = n; + goto fail; + } + if (s->status == BLK_DATA) { + allocated_done += n; + qemu_progress_print(100.0 * allocated_done / s->allocated_sectors, + 0); + } + + ret = convert_read(s, sector_num, n, buf); + if (ret < 0) { + error_report("error while reading sector %" PRId64 + ": %s", sector_num, strerror(-ret)); + goto fail; + } + + ret = convert_write(s, sector_num, n, buf); + if (ret < 0) { + error_report("error while writing sector %" PRId64 + ": %s", sector_num, strerror(-ret)); + goto fail; + } + + sector_num += n; + } + + if (s->compressed) { + /* signal EOF to align */ + ret = blk_write_compressed(s->target, 0, NULL, 0); + if (ret < 0) { + goto fail; + } + } + + ret = 0; +fail: + qemu_vfree(buf); + return ret; +} + static int img_convert(int argc, char **argv) { - int c, n, n1, bs_n, bs_i, compress, cluster_sectors, skip_create; + int c, bs_n, bs_i, compress, cluster_sectors, skip_create; int64_t ret = 0; int progress = 0, flags, src_flags; const char *fmt, *out_fmt, *cache, *src_cache, *out_baseimg, *out_filename; BlockDriver *drv, *proto_drv; BlockBackend **blk = NULL, *out_blk = NULL; BlockDriverState **bs = NULL, *out_bs = NULL; - int64_t total_sectors, nb_sectors, sector_num, bs_offset; + int64_t total_sectors; int64_t *bs_sectors = NULL; - uint8_t * buf = NULL; size_t bufsectors = IO_BUF_SIZE / BDRV_SECTOR_SIZE; - const uint8_t *buf1; BlockDriverInfo bdi; QemuOpts *opts = NULL; QemuOptsList *create_opts = NULL; @@ -1329,6 +1621,7 @@ static int img_convert(int argc, char **argv) bool quiet = false; Error *local_err = NULL; QemuOpts *sn_opts = NULL; + ImgConvertState state; fmt = NULL; out_fmt = "raw"; @@ -1627,9 +1920,6 @@ static int img_convert(int argc, char **argv) } out_bs = blk_bs(out_blk); - bs_i = 0; - bs_offset = 0; - /* increase bufsectors from the default 4096 (2M) if opt_transfer_length * or discard_alignment of the out_bs is greater. Limit to 32768 (16MB) * as maximum. */ @@ -1638,8 +1928,6 @@ static int img_convert(int argc, char **argv) out_bs->bl.discard_alignment)) ); - buf = blk_blockalign(out_blk, bufsectors * BDRV_SECTOR_SIZE); - if (skip_create) { int64_t output_sectors = blk_nb_sectors(out_blk); if (output_sectors < 0) { @@ -1666,203 +1954,20 @@ static int img_convert(int argc, char **argv) cluster_sectors = bdi.cluster_size / BDRV_SECTOR_SIZE; } - if (compress) { - if (cluster_sectors <= 0 || cluster_sectors > bufsectors) { - error_report("invalid cluster size"); - ret = -1; - goto out; - } - sector_num = 0; - - nb_sectors = total_sectors; - - for(;;) { - int64_t bs_num; - int remainder; - uint8_t *buf2; - - nb_sectors = total_sectors - sector_num; - if (nb_sectors <= 0) - break; - if (nb_sectors >= cluster_sectors) - n = cluster_sectors; - else - n = nb_sectors; - - bs_num = sector_num - bs_offset; - assert (bs_num >= 0); - remainder = n; - buf2 = buf; - while (remainder > 0) { - int nlow; - while (bs_num == bs_sectors[bs_i]) { - bs_offset += bs_sectors[bs_i]; - bs_i++; - assert (bs_i < bs_n); - bs_num = 0; - /* printf("changing part: sector_num=%" PRId64 ", " - "bs_i=%d, bs_offset=%" PRId64 ", bs_sectors=%" PRId64 - "\n", sector_num, bs_i, bs_offset, bs_sectors[bs_i]); */ - } - assert (bs_num < bs_sectors[bs_i]); - - nlow = remainder > bs_sectors[bs_i] - bs_num - ? bs_sectors[bs_i] - bs_num : remainder; - - ret = blk_read(blk[bs_i], bs_num, buf2, nlow); - if (ret < 0) { - error_report("error while reading sector %" PRId64 ": %s", - bs_num, strerror(-ret)); - goto out; - } - - buf2 += nlow * 512; - bs_num += nlow; - - remainder -= nlow; - } - assert (remainder == 0); - - if (!buffer_is_zero(buf, n * BDRV_SECTOR_SIZE)) { - ret = blk_write_compressed(out_blk, sector_num, buf, n); - if (ret != 0) { - error_report("error while compressing sector %" PRId64 - ": %s", sector_num, strerror(-ret)); - goto out; - } - } - sector_num += n; - qemu_progress_print(100.0 * sector_num / total_sectors, 0); - } - /* signal EOF to align */ - blk_write_compressed(out_blk, 0, NULL, 0); - } else { - int64_t sectors_to_read, sectors_read, sector_num_next_status; - bool count_allocated_sectors; - int has_zero_init = min_sparse ? bdrv_has_zero_init(out_bs) : 0; - - if (!has_zero_init && bdrv_can_write_zeroes_with_unmap(out_bs)) { - ret = bdrv_make_zero(out_bs, BDRV_REQ_MAY_UNMAP); - if (ret < 0) { - goto out; - } - has_zero_init = 1; - } - - sectors_to_read = total_sectors; - count_allocated_sectors = progress && (out_baseimg || has_zero_init); -restart: - sector_num = 0; // total number of sectors converted so far - sectors_read = 0; - sector_num_next_status = 0; - - for(;;) { - nb_sectors = total_sectors - sector_num; - if (nb_sectors <= 0) { - if (count_allocated_sectors) { - sectors_to_read = sectors_read; - count_allocated_sectors = false; - goto restart; - } - ret = 0; - break; - } - - while (sector_num - bs_offset >= bs_sectors[bs_i]) { - bs_offset += bs_sectors[bs_i]; - bs_i ++; - assert (bs_i < bs_n); - /* printf("changing part: sector_num=%" PRId64 ", bs_i=%d, " - "bs_offset=%" PRId64 ", bs_sectors=%" PRId64 "\n", - sector_num, bs_i, bs_offset, bs_sectors[bs_i]); */ - } - - if ((out_baseimg || has_zero_init) && - sector_num >= sector_num_next_status) { - n = nb_sectors > INT_MAX ? INT_MAX : nb_sectors; - ret = bdrv_get_block_status(bs[bs_i], sector_num - bs_offset, - n, &n1); - if (ret < 0) { - error_report("error while reading block status of sector %" - PRId64 ": %s", sector_num - bs_offset, - strerror(-ret)); - goto out; - } - /* If the output image is zero initialized, we are not working - * on a shared base and the input is zero we can skip the next - * n1 sectors */ - if (has_zero_init && !out_baseimg && (ret & BDRV_BLOCK_ZERO)) { - sector_num += n1; - continue; - } - /* If the output image is being created as a copy on write - * image, assume that sectors which are unallocated in the - * input image are present in both the output's and input's - * base images (no need to copy them). */ - if (out_baseimg) { - if (!(ret & BDRV_BLOCK_DATA)) { - sector_num += n1; - continue; - } - /* The next 'n1' sectors are allocated in the input image. - * Copy only those as they may be followed by unallocated - * sectors. */ - nb_sectors = n1; - } - /* avoid redundant callouts to get_block_status */ - sector_num_next_status = sector_num + n1; - } - - n = MIN(nb_sectors, bufsectors); - - /* round down request length to an aligned sector, but - * do not bother doing this on short requests. They happen - * when we found an all-zero area, and the next sector to - * write will not be sector_num + n. */ - if (cluster_sectors > 0 && n >= cluster_sectors) { - int64_t next_aligned_sector = (sector_num + n); - next_aligned_sector -= next_aligned_sector % cluster_sectors; - if (sector_num + n > next_aligned_sector) { - n = next_aligned_sector - sector_num; - } - } - - n = MIN(n, bs_sectors[bs_i] - (sector_num - bs_offset)); - - sectors_read += n; - if (count_allocated_sectors) { - sector_num += n; - continue; - } + state = (ImgConvertState) { + .src = blk, + .src_sectors = bs_sectors, + .src_num = bs_n, + .total_sectors = total_sectors, + .target = out_blk, + .compressed = compress, + .target_has_backing = (bool) out_baseimg, + .min_sparse = min_sparse, + .cluster_sectors = cluster_sectors, + .buf_sectors = bufsectors, + }; + ret = convert_do_copy(&state); - n1 = n; - ret = blk_read(blk[bs_i], sector_num - bs_offset, buf, n); - if (ret < 0) { - error_report("error while reading sector %" PRId64 ": %s", - sector_num - bs_offset, strerror(-ret)); - goto out; - } - /* NOTE: at the same time we convert, we do not write zero - sectors to have a chance to compress the image. Ideally, we - should add a specific call to have the info to go faster */ - buf1 = buf; - while (n > 0) { - if (!has_zero_init || - is_allocated_sectors_min(buf1, n, &n1, min_sparse)) { - ret = blk_write(out_blk, sector_num, buf1, n1); - if (ret < 0) { - error_report("error while writing sector %" PRId64 - ": %s", sector_num, strerror(-ret)); - goto out; - } - } - sector_num += n1; - n -= n1; - buf1 += n1 * 512; - } - qemu_progress_print(100.0 * sectors_read / sectors_to_read, 0); - } - } out: if (!ret) { qemu_progress_print(100, 0); @@ -1870,7 +1975,6 @@ out: qemu_progress_end(); qemu_opts_del(opts); qemu_opts_free(create_opts); - qemu_vfree(buf); qemu_opts_del(sn_opts); blk_unref(out_blk); g_free(bs); -- 1.8.3.1