diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block index b59ca4c8fbdf9536ca7932dcce2c87424d96a00b..97fc8034a2718267d8ae3fa8efa1be0bb57edeb8 100644 --- a/Documentation/ABI/stable/sysfs-block +++ b/Documentation/ABI/stable/sysfs-block @@ -21,6 +21,59 @@ Description: device is offset from the internal allocation unit's natural alignment. +What: /sys/block/<disk>/atomic_write_max_bytes +Date: February 2024 +Contact: Himanshu Madhani +Description: + [RO] This parameter specifies the maximum atomic write + size reported by the device. This parameter is relevant + for merging of writes, where a merged atomic write + operation must not exceed this number of bytes. + This parameter may be greater than the value in + atomic_write_unit_max_bytes as + atomic_write_unit_max_bytes will be rounded down to a + power-of-two and atomic_write_unit_max_bytes may also be + limited by some other queue limits, such as max_segments. + This parameter - along with atomic_write_unit_min_bytes + and atomic_write_unit_max_bytes - will not be larger than + max_hw_sectors_kb, but may be larger than max_sectors_kb. + + +What: /sys/block/<disk>/atomic_write_unit_min_bytes +Date: February 2024 +Contact: Himanshu Madhani +Description: + [RO] This parameter specifies the smallest block which can + be written atomically with an atomic write operation. All + atomic write operations must begin at an + atomic_write_unit_min boundary and must be multiples of + atomic_write_unit_min. This value must be a power-of-two. + + +What: /sys/block/<disk>/atomic_write_unit_max_bytes +Date: February 2024 +Contact: Himanshu Madhani +Description: + [RO] This parameter defines the largest block which can be + written atomically with an atomic write operation. This + value must be a multiple of atomic_write_unit_min and must + be a power-of-two. This value will not be larger than + atomic_write_max_bytes. + + +What: /sys/block/<disk>/atomic_write_boundary_bytes +Date: February 2024 +Contact: Himanshu Madhani +Description: + [RO] A device may need to internally split an atomic write I/O + which straddles a given logical block address boundary. This + parameter specifies the size in bytes of the atomic boundary if + one is reported by the device. This value must be a + power-of-two and at least as large as + atomic_write_unit_max_bytes. + Any attempt to merge atomic write I/Os must not result in a + merged I/O which crosses this boundary (if any). + What: /sys/block/<disk>/diskseq Date: February 2021 diff --git a/block/blk-core.c b/block/blk-core.c index 80ea58299ef63f9ba0a23705270583b583de0cf1..a1e87fc05162802a26084c34acc5f7f93acb91be 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -201,6 +201,8 @@ static const struct { /* Command duration limit device-side timeout */ [BLK_STS_DURATION_LIMIT] = { -ETIME, "duration limit exceeded" }, + [BLK_STS_INVAL] = { -EINVAL, "invalid" }, + /* everything else not covered above: */ [BLK_STS_IOERR] = { -EIO, "I/O" }, }; @@ -749,6 +751,18 @@ void submit_bio_noacct_nocheck(struct bio *bio) __submit_bio_noacct(bio); } +static blk_status_t blk_validate_atomic_write_op_size(struct request_queue *q, + struct bio *bio) +{ + if (bio->bi_iter.bi_size > queue_atomic_write_unit_max_bytes(q)) + return BLK_STS_INVAL; + + if (bio->bi_iter.bi_size % queue_atomic_write_unit_min_bytes(q)) + return BLK_STS_INVAL; + + return BLK_STS_OK; +} + /** * submit_bio_noacct - re-submit a bio to the block device layer for I/O * @bio: The bio describing the location in memory and on the device.
@@ -804,6 +818,14 @@ void submit_bio_noacct(struct bio *bio) bio_clear_polled(bio); switch (bio_op(bio)) { + case REQ_OP_READ: + case REQ_OP_WRITE: + if (bio->bi_opf & REQ_ATOMIC) { + status = blk_validate_atomic_write_op_size(q, bio); + if (status != BLK_STS_OK) + goto end_io; + } + break; case REQ_OP_DISCARD: if (!bdev_max_discard_sectors(bdev)) goto not_supported; diff --git a/block/blk-merge.c b/block/blk-merge.c index 5e753ba8ef5d76faf3df084d00703be027a80606..c9bd534cfe3f2a6a8ddb8ea3a96f158a82df4f91 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -158,6 +158,19 @@ static struct bio *bio_split_write_zeroes(struct bio *bio, return bio_split(bio, lim->max_write_zeroes_sectors, GFP_NOIO, bs); } +static inline unsigned int blk_boundary_sectors(const struct queue_limits *lim, + bool is_atomic) +{ + /* + * chunk_sectors must be a multiple of atomic_write_boundary_sectors if + * both are non-zero. + */ + if (is_atomic && lim->atomic_write_boundary_sectors) + return lim->atomic_write_boundary_sectors; + + return lim->chunk_sectors; +} + /* * Return the maximum number of sectors from the start of a bio that may be * submitted as a single request to a block device. If enough sectors remain, @@ -171,12 +184,23 @@ static inline unsigned get_max_io_size(struct bio *bio, { unsigned pbs = lim->physical_block_size >> SECTOR_SHIFT; unsigned lbs = lim->logical_block_size >> SECTOR_SHIFT; - unsigned max_sectors = lim->max_sectors, start, end; + bool is_atomic = bio->bi_opf & REQ_ATOMIC; + unsigned boundary_sectors = blk_boundary_sectors(lim, is_atomic); + unsigned max_sectors, start, end; - if (lim->chunk_sectors) { + /* + * We ignore lim->max_sectors for atomic writes because it may be less + * than the actual bio size, which we cannot tolerate. + */ + if (is_atomic) + max_sectors = lim->atomic_write_max_sectors; + else + max_sectors = lim->max_sectors; + + if (boundary_sectors) { max_sectors = min(max_sectors, - blk_chunk_sectors_left(bio->bi_iter.bi_sector, - lim->chunk_sectors)); + blk_boundary_sectors_left(bio->bi_iter.bi_sector, + boundary_sectors)); } start = bio->bi_iter.bi_sector & (pbs - 1); @@ -317,6 +341,11 @@ struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, *segs = nsegs; return NULL; split: + if (bio->bi_opf & REQ_ATOMIC) { + bio->bi_status = BLK_STS_INVAL; + bio_endio(bio); + return ERR_PTR(-EINVAL); + } /* * We can't sanely support splitting for a REQ_NOWAIT bio. End it * with EAGAIN if splitting is required and return an error pointer.
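
For reference, these admission checks can be mirrored from userspace before submitting an atomic write, which avoids burning an I/O just to get BLK_STS_INVAL back. The sketch below is illustrative and not part of the patch: the atomic_write_ok() helper is hypothetical, and it assumes the new attributes appear under the disk's queue/ sysfs directory, where the queue_attrs added later in this series are exposed.

#include <stdbool.h>
#include <stdio.h>

static unsigned long read_queue_limit(const char *disk, const char *attr)
{
	char path[256];
	unsigned long val = 0;
	FILE *f;

	snprintf(path, sizeof(path), "/sys/block/%s/queue/%s", disk, attr);
	f = fopen(path, "r");
	if (!f)
		return 0;
	if (fscanf(f, "%lu", &val) != 1)
		val = 0;
	fclose(f);
	return val;
}

/*
 * Hypothetical helper mirroring blk_validate_atomic_write_op_size() plus
 * the power-of-two and natural-alignment rules that the VFS enforces.
 */
static bool atomic_write_ok(const char *disk, unsigned long pos,
			    unsigned long len)
{
	unsigned long min = read_queue_limit(disk, "atomic_write_unit_min_bytes");
	unsigned long max = read_queue_limit(disk, "atomic_write_unit_max_bytes");

	if (!min || !max)
		return false;		/* device reports no atomic write support */
	if (!len || len > max || len % min)
		return false;		/* kernel would answer BLK_STS_INVAL */
	if (len & (len - 1))
		return false;		/* length must be a power-of-two */
	return pos % len == 0;		/* and the offset naturally aligned */
}
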
@@ -599,18 +628,22 @@ static inline unsigned int blk_rq_get_max_sectors(struct request *rq, sector_t offset) { struct request_queue *q = rq->q; - unsigned int max_sectors; + struct queue_limits *lim = &q->limits; + unsigned int max_sectors, boundary_sectors; + bool is_atomic = rq->cmd_flags & REQ_ATOMIC; if (blk_rq_is_passthrough(rq)) return q->limits.max_hw_sectors; - max_sectors = blk_queue_get_max_sectors(q, req_op(rq)); - if (!q->limits.chunk_sectors || + boundary_sectors = blk_boundary_sectors(lim, is_atomic); + max_sectors = blk_queue_get_max_sectors(rq); + + if (!boundary_sectors || req_op(rq) == REQ_OP_DISCARD || req_op(rq) == REQ_OP_SECURE_ERASE) return max_sectors; return min(max_sectors, - blk_chunk_sectors_left(offset, q->limits.chunk_sectors)); + blk_boundary_sectors_left(offset, boundary_sectors)); } static inline int ll_new_hw_segment(struct request *req, struct bio *bio, @@ -809,6 +842,18 @@ static enum elv_merge blk_try_req_merge(struct request *req, return ELEVATOR_NO_MERGE; } +static bool blk_atomic_write_mergeable_rq_bio(struct request *rq, + struct bio *bio) +{ + return (rq->cmd_flags & REQ_ATOMIC) == (bio->bi_opf & REQ_ATOMIC); +} + +static bool blk_atomic_write_mergeable_rqs(struct request *rq, + struct request *next) +{ + return (rq->cmd_flags & REQ_ATOMIC) == (next->cmd_flags & REQ_ATOMIC); +} + /* * For non-mq, this has to be called with the request spinlock acquired. * For mq with scheduling, the appropriate queue wide lock should be held. @@ -828,6 +873,9 @@ static struct request *attempt_merge(struct request_queue *q, if (req->ioprio != next->ioprio) return NULL; + if (!blk_atomic_write_mergeable_rqs(req, next)) + return NULL; + /* * If we are allowed to merge, then append bio list * from next to rq and release next. merge_requests_fn @@ -956,6 +1004,9 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) if (rq->ioprio != bio_prio(bio)) return false; + if (blk_atomic_write_mergeable_rq_bio(rq, bio) == false) + return false; + return true; } diff --git a/block/blk-mq.c b/block/blk-mq.c index 9281c00aa8a9481ca8132a912c7d1e8885d60f0e..7f3ecd37dadb9e1cfb3506bf02c0de0366da5d5f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3151,7 +3151,7 @@ void blk_mq_submit_bio(struct bio *bio) blk_status_t blk_insert_cloned_request(struct request *rq) { struct request_queue *q = rq->q; - unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq)); + unsigned int max_sectors = blk_queue_get_max_sectors(rq); unsigned int max_segments = blk_rq_get_max_segments(rq); blk_status_t ret; diff --git a/block/blk-settings.c b/block/blk-settings.c index 021994f6d2d82953e6bf0ca3e2ed2893263d3592..1a2b288e20dfef00e3d1fab9818f28bb088d6bf2 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -25,6 +25,92 @@ void blk_queue_rq_timeout(struct request_queue *q, unsigned int timeout) } EXPORT_SYMBOL_GPL(blk_queue_rq_timeout); +/* + * Returns max guaranteed bytes which we can fit in a bio. + * + * We request that an atomic_write is an ITER_UBUF iov_iter (so a single + * vector), so we assume that we can fit at least PAGE_SIZE in a segment, + * apart from the first and last segments.
+ */ +static +unsigned int blk_queue_max_guaranteed_bio(struct queue_limits *lim) +{ + unsigned int max_segments = min(BIO_MAX_VECS, lim->max_segments); + unsigned int length; + + length = min(max_segments, 2) * lim->logical_block_size; + if (max_segments > 2) + length += (max_segments - 2) * PAGE_SIZE; + + return length; +} + +static void blk_atomic_writes_update_limits(struct queue_limits *lim) +{ + unsigned int unit_limit = min(lim->max_hw_sectors << SECTOR_SHIFT, + blk_queue_max_guaranteed_bio(lim)); + + unit_limit = rounddown_pow_of_two(unit_limit); + + lim->atomic_write_max_sectors = + min(lim->atomic_write_hw_max >> SECTOR_SHIFT, + lim->max_hw_sectors); + lim->atomic_write_unit_min = + min(lim->atomic_write_hw_unit_min, unit_limit); + lim->atomic_write_unit_max = + min(lim->atomic_write_hw_unit_max, unit_limit); + lim->atomic_write_boundary_sectors = + lim->atomic_write_hw_boundary >> SECTOR_SHIFT; +} + +static void blk_validate_atomic_write_limits(struct queue_limits *lim) +{ + unsigned int chunk_sectors = lim->chunk_sectors; + unsigned int boundary_sectors; + + if (!lim->atomic_write_hw_max) + goto unsupported; + + boundary_sectors = lim->atomic_write_hw_boundary >> SECTOR_SHIFT; + + if (boundary_sectors) { + /* + * A feature of boundary support is that it disallows bios to + * be merged which would result in a merged request which + * crosses either a chunk sector or atomic write HW boundary, + * even though chunk sectors may be just set for performance. + * For simplicity, disallow atomic writes for a chunk sector + * which is non-zero and smaller than atomic write HW boundary. + * Furthermore, chunk sectors must be a multiple of atomic + * write HW boundary. Otherwise boundary support becomes + * complicated. + * Devices which do not conform to these rules can be dealt + * with if and when they show up. + */ + if (WARN_ON_ONCE(do_div(chunk_sectors, boundary_sectors))) + goto unsupported; + + /* + * The boundary size just needs to be a multiple of unit_max + * (and not necessarily a power-of-2), so this following check + * could be relaxed in future. + * Furthermore, if needed, unit_max could even be reduced so + * that it is compliant with a !power-of-2 boundary. 
+ */ + if (!is_power_of_2(boundary_sectors)) + goto unsupported; + } + + blk_atomic_writes_update_limits(lim); + return; + +unsupported: + lim->atomic_write_max_sectors = 0; + lim->atomic_write_boundary_sectors = 0; + lim->atomic_write_unit_min = 0; + lim->atomic_write_unit_max = 0; +} + /** * blk_set_default_limits - reset limits to default values * @lim: the queue_limits structure to reset @@ -59,6 +145,8 @@ void blk_set_default_limits(struct queue_limits *lim) lim->zoned = BLK_ZONED_NONE; lim->zone_write_granularity = 0; lim->dma_alignment = 511; + + blk_validate_atomic_write_limits(lim); } /** diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index d3aa7a6ff1003be808b68435938e7f352ef522c6..40211468355ecdbfb4148d75e1a274cea6fca118 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -119,6 +119,30 @@ static ssize_t queue_max_discard_segments_show(struct request_queue *q, return queue_var_show(queue_max_discard_segments(q), page); } +static ssize_t queue_atomic_write_max_bytes_show(struct request_queue *q, + char *page) +{ + return queue_var_show(queue_atomic_write_max_bytes(q), page); +} + +static ssize_t queue_atomic_write_boundary_show(struct request_queue *q, + char *page) +{ + return queue_var_show(queue_atomic_write_boundary_bytes(q), page); +} + +static ssize_t queue_atomic_write_unit_min_show(struct request_queue *q, + char *page) +{ + return queue_var_show(queue_atomic_write_unit_min_bytes(q), page); +} + +static ssize_t queue_atomic_write_unit_max_show(struct request_queue *q, + char *page) +{ + return queue_var_show(queue_atomic_write_unit_max_bytes(q), page); +} + static ssize_t queue_max_integrity_segments_show(struct request_queue *q, char *page) { return queue_var_show(q->limits.max_integrity_segments, page); @@ -547,6 +571,11 @@ QUEUE_RO_ENTRY(queue_discard_max_hw, "discard_max_hw_bytes"); QUEUE_RW_ENTRY(queue_discard_max, "discard_max_bytes"); QUEUE_RO_ENTRY(queue_discard_zeroes_data, "discard_zeroes_data"); +QUEUE_RO_ENTRY(queue_atomic_write_max_bytes, "atomic_write_max_bytes"); +QUEUE_RO_ENTRY(queue_atomic_write_boundary, "atomic_write_boundary_bytes"); +QUEUE_RO_ENTRY(queue_atomic_write_unit_max, "atomic_write_unit_max_bytes"); +QUEUE_RO_ENTRY(queue_atomic_write_unit_min, "atomic_write_unit_min_bytes"); + QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes"); QUEUE_RO_ENTRY(queue_write_zeroes_max, "write_zeroes_max_bytes"); QUEUE_RO_ENTRY(queue_zone_append_max, "zone_append_max_bytes"); @@ -672,6 +701,10 @@ static struct attribute *queue_attrs[] = { &queue_discard_max_entry.attr, &queue_discard_max_hw_entry.attr, &queue_discard_zeroes_data_entry.attr, + &queue_atomic_write_max_bytes_entry.attr, + &queue_atomic_write_boundary_entry.attr, + &queue_atomic_write_unit_min_entry.attr, + &queue_atomic_write_unit_max_entry.attr, &queue_write_same_max_entry.attr, &queue_write_zeroes_max_entry.attr, &queue_zone_append_max_entry.attr, diff --git a/block/blk.h b/block/blk.h index 0c469a84c9e050c413ee0bfd480033f62e356a62..1e157e11a187f7c841dbd44ea7fd67335dbf9fdd 100644 --- a/block/blk.h +++ b/block/blk.h @@ -168,9 +168,11 @@ static inline unsigned int blk_rq_get_max_segments(struct request *rq) return queue_max_segments(rq->q); } -static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q, - enum req_op op) +static inline unsigned int blk_queue_get_max_sectors(struct request *rq) { + struct request_queue *q = rq->q; + enum req_op op = req_op(rq); + if (unlikely(op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE)) return 
min(q->limits.max_discard_sectors, UINT_MAX >> SECTOR_SHIFT); @@ -178,6 +180,9 @@ static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q, if (unlikely(op == REQ_OP_WRITE_ZEROES)) return q->limits.max_write_zeroes_sectors; + if (rq->cmd_flags & REQ_ATOMIC) + return q->limits.atomic_write_max_sectors; + return q->limits.max_sectors; } diff --git a/block/fops.c b/block/fops.c index cda9978ebf67436cd70bb7eb5e04064ad18236e1..f65788081f836612c5179383e6624aadd97e5662 100644 --- a/block/fops.c +++ b/block/fops.c @@ -34,28 +34,25 @@ static blk_opf_t dio_bio_write_op(struct kiocb *iocb) return opf; } -static bool blkdev_dio_unaligned(struct block_device *bdev, loff_t pos, - struct iov_iter *iter) +static bool blkdev_dio_invalid(struct block_device *bdev, struct kiocb *iocb, + struct iov_iter *iter) { - return pos & (bdev_logical_block_size(bdev) - 1) || + return iocb->ki_pos & (bdev_logical_block_size(bdev) - 1) || !bdev_iter_is_aligned(bdev, iter); } #define DIO_INLINE_BIO_VECS 4 static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, - struct iov_iter *iter, unsigned int nr_pages) + struct iov_iter *iter, struct block_device *bdev, + unsigned int nr_pages) { - struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs; loff_t pos = iocb->ki_pos; bool should_dirty = false; struct bio bio; ssize_t ret; - if (blkdev_dio_unaligned(bdev, pos, iter)) - return -EINVAL; - if (nr_pages <= DIO_INLINE_BIO_VECS) vecs = inline_vecs; else { @@ -74,6 +71,8 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, } bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT; bio.bi_ioprio = iocb->ki_ioprio; + if (iocb->ki_flags & IOCB_ATOMIC) + bio.bi_opf |= REQ_ATOMIC; ret = bio_iov_iter_get_pages(&bio, iter); if (unlikely(ret)) @@ -160,9 +159,8 @@ static void blkdev_bio_end_io(struct bio *bio) } static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - unsigned int nr_pages) + struct block_device *bdev, unsigned int nr_pages) { - struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); struct blk_plug plug; struct blkdev_dio *dio; struct bio *bio; @@ -171,9 +169,6 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t pos = iocb->ki_pos; int ret = 0; - if (blkdev_dio_unaligned(bdev, pos, iter)) - return -EINVAL; - if (iocb->ki_flags & IOCB_ALLOC_CACHE) opf |= REQ_ALLOC_CACHE; bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL, @@ -300,9 +295,9 @@ static void blkdev_bio_end_io_async(struct bio *bio) static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, struct iov_iter *iter, + struct block_device *bdev, unsigned int nr_pages) { - struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); bool is_read = iov_iter_rw(iter) == READ; blk_opf_t opf = is_read ? 
REQ_OP_READ : dio_bio_write_op(iocb); struct blkdev_dio *dio; @@ -310,9 +305,6 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, loff_t pos = iocb->ki_pos; int ret = 0; - if (blkdev_dio_unaligned(bdev, pos, iter)) - return -EINVAL; - if (iocb->ki_flags & IOCB_ALLOC_CACHE) opf |= REQ_ALLOC_CACHE; bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL, @@ -350,6 +342,9 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, task_io_account_write(bio->bi_iter.bi_size); } + if (iocb->ki_flags & IOCB_ATOMIC) + bio->bi_opf |= REQ_ATOMIC; + if (iocb->ki_flags & IOCB_NOWAIT) bio->bi_opf |= REQ_NOWAIT; @@ -365,18 +360,25 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { + struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); unsigned int nr_pages; if (!iov_iter_count(iter)) return 0; + if (blkdev_dio_invalid(bdev, iocb, iter)) + return -EINVAL; + nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1); if (likely(nr_pages <= BIO_MAX_VECS)) { if (is_sync_kiocb(iocb)) - return __blkdev_direct_IO_simple(iocb, iter, nr_pages); - return __blkdev_direct_IO_async(iocb, iter, nr_pages); + return __blkdev_direct_IO_simple(iocb, iter, bdev, + nr_pages); + return __blkdev_direct_IO_async(iocb, iter, bdev, nr_pages); + } else if (iocb->ki_flags & IOCB_ATOMIC) { + return -EINVAL; } - return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages)); + return __blkdev_direct_IO(iocb, iter, bdev, bio_max_segs(nr_pages)); } static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length, @@ -602,6 +604,9 @@ static int blkdev_open(struct inode *inode, struct file *filp) if (IS_ERR(handle)) return PTR_ERR(handle); + if (bdev_can_atomic_write(handle->bdev)) + filp->f_mode |= FMODE_CAN_ATOMIC_WRITE; + if (bdev_nowait(handle->bdev)) filp->f_mode |= FMODE_NOWAIT; @@ -677,6 +682,12 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT) return -EOPNOTSUPP; + if (iocb->ki_flags & IOCB_ATOMIC) { + ret = generic_atomic_write_valid(iocb, from); + if (ret) + return ret; + } + size -= iocb->ki_pos; if (iov_iter_count(from) > size) { shorted = iov_iter_count(from) - size; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index ffa469ececb92cec938cdb9ff1998f1405570734..b0ea47a238e8756567bb6e247fffa74542b8d212 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1177,7 +1177,7 @@ static sector_t __max_io_len(struct dm_target *ti, sector_t sector, return len; return min_t(sector_t, len, min(max_sectors ? 
: queue_max_sectors(ti->table->md->queue), - blk_chunk_sectors_left(target_offset, max_granularity))); + blk_boundary_sectors_left(target_offset, max_granularity))); } static inline sector_t max_io_len(struct dm_target *ti, sector_t sector) diff --git a/fs/aio.c b/fs/aio.c index c3614193d749349f281ecca7e37667a80f126f20..5f5ce804d7ef15eec776157406dc7c215ea229a5 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1467,7 +1467,7 @@ static void aio_complete_rw(struct kiocb *kiocb, long res) iocb_put(iocb); } -static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) +static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb, int rw_type) { int ret; @@ -1493,7 +1493,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) } else req->ki_ioprio = get_current_ioprio(); - ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags); + ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags, rw_type); if (unlikely(ret)) return ret; @@ -1545,7 +1545,7 @@ static int aio_read(struct kiocb *req, const struct iocb *iocb, struct file *file; int ret; - ret = aio_prep_rw(req, iocb); + ret = aio_prep_rw(req, iocb, READ); if (ret) return ret; file = req->ki_filp; @@ -1572,7 +1572,7 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb, struct file *file; int ret; - ret = aio_prep_rw(req, iocb); + ret = aio_prep_rw(req, iocb, WRITE); if (ret) return ret; file = req->ki_filp; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 3d1cb018f3c4a321273749aafcdd1a7282a8f427..5487b35bc78e2b308dd99a07095fb906332c6c1a 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4569,7 +4569,7 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool goto out_iov; init_sync_kiocb(&kiocb, file); - ret = kiocb_set_rw_flags(&kiocb, 0); + ret = kiocb_set_rw_flags(&kiocb, 0, WRITE); if (ret) goto out_iov; kiocb.ki_pos = pos; diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index bcd3f8cf5ea42f0fa0e4967510230f7e5d25b1fa..c654d75a2e7fec2c2e4fc5006e0acf42eba5e6b9 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -251,25 +251,20 @@ static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, } /* - * Figure out the bio's operation flags from the dio request, the - * mapping, and whether or not we want FUA. Note that we can end up - * clearing the WRITE_THROUGH flag in the dio request. + * Use a FUA write if we need datasync semantics and this is a pure data I/O + * that doesn't require any metadata updates (including after I/O completion + * such as unwritten extent conversion) and the underlying device either + * doesn't have a volatile write cache or supports FUA. + * This allows us to avoid cache flushes on I/O completion.
*/ -static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio, - const struct iomap *iomap, bool use_fua) +static inline bool iomap_dio_can_use_fua(const struct iomap *iomap, + struct iomap_dio *dio) { - blk_opf_t opflags = REQ_SYNC | REQ_IDLE; - - if (!(dio->flags & IOMAP_DIO_WRITE)) - return REQ_OP_READ; - - opflags |= REQ_OP_WRITE; - if (use_fua) - opflags |= REQ_FUA; - else - dio->flags &= ~IOMAP_DIO_WRITE_THROUGH; - - return opflags; + if (iomap->flags & (IOMAP_F_SHARED | IOMAP_F_DIRTY)) + return false; + if (!(dio->flags & IOMAP_DIO_WRITE_THROUGH)) + return false; + return !bdev_write_cache(iomap->bdev) || bdev_fua(iomap->bdev); } static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, @@ -278,12 +273,11 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, const struct iomap *iomap = &iter->iomap; struct inode *inode = iter->inode; unsigned int fs_block_size = i_blocksize(inode), pad; - loff_t length = iomap_length(iter); + const loff_t length = iomap_length(iter); loff_t pos = iter->pos; - blk_opf_t bio_opf; + blk_opf_t bio_opf = REQ_SYNC | REQ_IDLE; struct bio *bio; bool need_zeroout = false; - bool use_fua = false; int nr_pages, ret = 0; size_t copied = 0; size_t orig_count; @@ -292,34 +286,51 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter)) return -EINVAL; - if (iomap->type == IOMAP_UNWRITTEN) { - dio->flags |= IOMAP_DIO_UNWRITTEN; - need_zeroout = true; - } + if (dio->flags & IOMAP_DIO_WRITE) { + bio_opf |= REQ_OP_WRITE; + + if (iomap->flags & IOMAP_F_ATOMIC_BIO) { + /* + * Ensure that the mapping covers the full write + * length, otherwise it won't be submitted as a single + * bio, which is required to use hardware atomics. + */ + if (length != iter->len) + return -EINVAL; + bio_opf |= REQ_ATOMIC; + } + + if (iomap->type == IOMAP_UNWRITTEN) { + dio->flags |= IOMAP_DIO_UNWRITTEN; + need_zeroout = true; + } + + if (iomap->flags & IOMAP_F_SHARED) + dio->flags |= IOMAP_DIO_COW; - if (iomap->flags & IOMAP_F_SHARED) - dio->flags |= IOMAP_DIO_COW; + if (iomap->flags & IOMAP_F_NEW) { + need_zeroout = true; + } else if (iomap->type == IOMAP_MAPPED) { + if (iomap_dio_can_use_fua(iomap, dio)) + bio_opf |= REQ_FUA; + else + dio->flags &= ~IOMAP_DIO_WRITE_THROUGH; + } - if (iomap->flags & IOMAP_F_NEW) { - need_zeroout = true; - } else if (iomap->type == IOMAP_MAPPED) { /* - * Use a FUA write if we need datasync semantics, this is a pure - * data IO that doesn't require any metadata updates (including - * after IO completion such as unwritten extent conversion) and - * the underlying device either supports FUA or doesn't have - * a volatile write cache. This allows us to avoid cache flushes - * on IO completion. If we can't use writethrough and need to - * sync, disable in-task completions as dio completion will - * need to call generic_write_sync() which will do a blocking - * fsync / cache flush call. + * We can only do deferred completion for pure overwrites that + * don't require additional I/O at completion time. + * + * This rules out writes that need zeroing or extent conversion, + * extend the file size, or issue metadata I/O or cache flushes + * during completion processing. 
*/ - if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) && - (dio->flags & IOMAP_DIO_WRITE_THROUGH) && - (bdev_fua(iomap->bdev) || !bdev_write_cache(iomap->bdev))) - use_fua = true; - else if (dio->flags & IOMAP_DIO_NEED_SYNC) + if (need_zeroout || (pos >= i_size_read(inode)) || + ((dio->flags & IOMAP_DIO_NEED_SYNC) && + !(bio_opf & REQ_FUA))) dio->flags &= ~IOMAP_DIO_CALLER_COMP; + } else { + bio_opf |= REQ_OP_READ; } /* @@ -333,18 +344,6 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, if (!iov_iter_count(dio->submit.iter)) goto out; - /* - * We can only do deferred completion for pure overwrites that - * don't require additional IO at completion. This rules out - * writes that need zeroing or extent conversion, extend - * the file size, or issue journal IO or cache flushes - * during completion processing. - */ - if (need_zeroout || - ((dio->flags & IOMAP_DIO_NEED_SYNC) && !use_fua) || - ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) - dio->flags &= ~IOMAP_DIO_CALLER_COMP; - /* * The rules for polled IO completions follow the guidelines as the * ones we set for inline and deferred completions. If none of those @@ -365,7 +364,6 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, * can set up the page vector appropriately for a ZONE_APPEND * operation. */ - bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua); nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS); do { @@ -397,12 +395,21 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, } n = bio->bi_iter.bi_size; - if (dio->flags & IOMAP_DIO_WRITE) { - task_io_account_write(n); - } else { - if (dio->flags & IOMAP_DIO_DIRTY) - bio_set_pages_dirty(bio); + if (WARN_ON_ONCE((bio_opf & REQ_ATOMIC) && n != length)) { + /* + * An atomic write bio must cover the complete length, + * which it doesn't, so error. We may need to zero out + * the tail (complete FS block), similar to when + * bio_iov_iter_get_pages() returns an error, above. + */ + ret = -EINVAL; + bio_put(bio); + goto zero_tail; } + if (dio->flags & IOMAP_DIO_WRITE) + task_io_account_write(n); + else if (dio->flags & IOMAP_DIO_DIRTY) + bio_set_pages_dirty(bio); dio->size += n; copied += n; @@ -613,6 +620,9 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, iomi.flags |= IOMAP_OVERWRITE_ONLY; } + if (iocb->ki_flags & IOCB_ATOMIC) + iomi.flags |= IOMAP_ATOMIC; + /* for data sync or sync, we need sync completion processing */ if (iocb_is_dsync(iocb)) { dio->flags |= IOMAP_DIO_NEED_SYNC; @@ -640,7 +650,17 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (ret != -EAGAIN) { trace_iomap_dio_invalidate_fail(inode, iomi.pos, iomi.len); - ret = -ENOTBLK; + if (iocb->ki_flags & IOCB_ATOMIC) { + /* + * folio invalidation failed, maybe + * this is transient, unlock and see if + * the caller tries again. 
+ */ + ret = -EAGAIN; + } else { + /* fall back to buffered write */ + ret = -ENOTBLK; + } } goto out_free_dio; } diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h index 3ef694f9489f02080f44e931a838bf0208810059..65d7a58465f05980321ad94e797a6ee95f118810 100644 --- a/fs/iomap/trace.h +++ b/fs/iomap/trace.h @@ -98,7 +98,8 @@ DEFINE_RANGE_EVENT(iomap_dio_rw_queued); { IOMAP_REPORT, "REPORT" }, \ { IOMAP_FAULT, "FAULT" }, \ { IOMAP_DIRECT, "DIRECT" }, \ - { IOMAP_NOWAIT, "NOWAIT" } + { IOMAP_NOWAIT, "NOWAIT" }, \ + { IOMAP_ATOMIC, "ATOMIC" } #define IOMAP_F_FLAGS_STRINGS \ { IOMAP_F_NEW, "NEW" }, \ diff --git a/fs/read_write.c b/fs/read_write.c index 27a7729b82f36bf90c88a55b1aa96d984ff33fc5..6d0cc2090c710058974fd0f8485b784f3f716a1c 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -726,7 +726,7 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, ssize_t ret; init_sync_kiocb(&kiocb, filp); - ret = kiocb_set_rw_flags(&kiocb, flags); + ret = kiocb_set_rw_flags(&kiocb, flags, type); if (ret) return ret; kiocb.ki_pos = (ppos ? *ppos : 0); @@ -1766,6 +1766,26 @@ int generic_file_rw_checks(struct file *file_in, struct file *file_out) return 0; } +int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter) +{ + size_t len = iov_iter_count(iter); + + if (!iter_is_ubuf(iter)) + return -EINVAL; + + if (!is_power_of_2(len)) + return -EINVAL; + + if (!IS_ALIGNED(iocb->ki_pos, len)) + return -EINVAL; + + if (!(iocb->ki_flags & IOCB_DIRECT)) + return -EOPNOTSUPP; + + return 0; +} +EXPORT_SYMBOL_GPL(generic_atomic_write_valid); + #ifdef CONFIG_BPF_READAHEAD static void fs_file_read_ctx_init(struct fs_file_read_ctx *ctx, struct file *filp, loff_t pos) diff --git a/fs/stat.c b/fs/stat.c index 5375be5f97ccfdbfc490b1d736af36e47a49a25c..2ce597f9845dd957c18a6e63f4f794a0e438add9 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -89,6 +89,40 @@ void generic_fill_statx_attr(struct inode *inode, struct kstat *stat) } EXPORT_SYMBOL(generic_fill_statx_attr); +/** + * generic_fill_statx_atomic_writes - Fill in atomic writes statx attributes + * @stat: Where to fill in the attribute flags + * @unit_min: Minimum supported atomic write length in bytes + * @unit_max: Maximum supported atomic write length in bytes + * @unit_max_opt: Optimised maximum supported atomic write length in bytes + * + * Fill in the STATX{_ATTR}_WRITE_ATOMIC flags in the kstat structure from + * atomic write unit_min and unit_max values. 
+ */ +void generic_fill_statx_atomic_writes(struct kstat *stat, + unsigned int unit_min, + unsigned int unit_max, + unsigned int unit_max_opt) +{ + /* Confirm that the request type is known */ + stat->result_mask |= STATX_WRITE_ATOMIC; + + /* Confirm that the file attribute type is known */ + stat->attributes_mask |= STATX_ATTR_WRITE_ATOMIC; + + if (unit_min) { + stat->atomic_write_unit_min = unit_min; + stat->atomic_write_unit_max = unit_max; + stat->atomic_write_unit_max_opt = unit_max_opt; + /* Initially only allow 1x segment */ + stat->atomic_write_segments_max = 1; + + /* Confirm atomic writes are actually supported */ + stat->attributes |= STATX_ATTR_WRITE_ATOMIC; + } +} +EXPORT_SYMBOL_GPL(generic_fill_statx_atomic_writes); + /** * vfs_getattr_nosec - getattr without security checks * @path: file to get attributes from @@ -653,6 +687,7 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer) tmp.stx_mnt_id = stat->mnt_id; tmp.stx_dio_mem_align = stat->dio_mem_align; tmp.stx_dio_offset_align = stat->dio_offset_align; + tmp.stx_atomic_write_unit_max_opt = stat->atomic_write_unit_max_opt; return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0; } diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 9ed22d82c000fd065d60c9821a52053db7b25915..ac35518210f4c3067053875fa7595b06615ffe37 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -3333,6 +3333,11 @@ xfs_bmap_compute_alignments( align = xfs_get_cowextsz_hint(ap->ip); else if (ap->datatype & XFS_ALLOC_USERDATA) align = xfs_get_extsz_hint(ap->ip); + + /* Try to align start block to any minimum allocation alignment */ + if (align > 1 && (ap->flags & XFS_BMAPI_EXTSZALIGN)) + args->alignment = align; + if (align) { if (xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 0, ap->eof, 0, ap->conv, &ap->offset, diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index e33470e39728d58fec33907f34860db700634d12..8887252dfe07f8831a5d1afdb143a8c69a84bb64 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -87,6 +87,9 @@ struct xfs_bmalloca { /* Do not update the rmap btree. Used for reconstructing bmbt from rmapbt. 
*/ #define XFS_BMAPI_NORMAP (1u << 10) +/* Try to align allocations to the extent size hint */ +#define XFS_BMAPI_EXTSZALIGN (1u << 11) + #define XFS_BMAPI_FLAGS \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ { XFS_BMAPI_METADATA, "METADATA" }, \ @@ -98,7 +101,8 @@ struct xfs_bmalloca { { XFS_BMAPI_REMAP, "REMAP" }, \ { XFS_BMAPI_COWFORK, "COWFORK" }, \ { XFS_BMAPI_NODISCARD, "NODISCARD" }, \ - { XFS_BMAPI_NORMAP, "NORMAP" } + { XFS_BMAPI_NORMAP, "NORMAP" },\ + { XFS_BMAPI_EXTSZALIGN, "EXTSZALIGN" } static inline int xfs_bmapi_aflag(int w) diff --git a/fs/xfs/libxfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c index 9975b93a7412d8cd041d4fc8cbcc57f9e4c8be00..ba9e0ca6e38169d23f026202d81035c2f607b237 100644 --- a/fs/xfs/libxfs/xfs_log_rlimit.c +++ b/fs/xfs/libxfs/xfs_log_rlimit.c @@ -61,6 +61,9 @@ xfs_log_calc_trans_resv_for_minlogblocks( xfs_trans_resv_calc(mp, resv); + /* Copy the dynamic transaction reservation types from the running fs */ + resv->tr_atomic_ioend = M_RES(mp)->tr_atomic_ioend; + if (xfs_has_reflink(mp)) { /* * In the early days of reflink, typical log operation counts diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index 1bb2891b26ffb3cf860709aa8a8d9ef80795a131..601344d9cd9e596e8e0dd9047587922b68f44206 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -19,6 +19,12 @@ #include "xfs_trans.h" #include "xfs_qm.h" #include "xfs_trans_space.h" +#include "xfs_defer.h" +#include "xfs_bmap_item.h" +#include "xfs_extfree_item.h" +#include "xfs_rmap_item.h" +#include "xfs_refcount_item.h" +#include "xfs_trace.h" #define _ALLOC true #define _FREE false @@ -243,6 +249,24 @@ xfs_rtalloc_block_count( * register overflow from temporaries in the calculations. */ +/* + * Finishing data device refcount updates (t1): + * the agfs of the ags containing the blocks: nr_ops * sector size + * the refcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size + */ +inline unsigned int +xfs_calc_finish_cui_reservation( + struct xfs_mount *mp, + unsigned int nr_ops) +{ + if (!xfs_has_reflink(mp)) + return 0; + + return xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops), + mp->m_sb.sb_blocksize); +} + /* * Compute the log reservation required to handle the refcount update * transaction. Refcount updates are always done via deferred log items.
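
To make the t1 formula above concrete, here is a small worked example; the geometry values are assumptions chosen for illustration (512-byte sectors, 4096-byte blocks, a 5-level refcount btree), not values taken from the patch:

#include <stdio.h>

/* Assumed geometry, for illustration only. */
#define SECTSIZE	512U
#define BLOCKSIZE	4096U
#define REFC_MAXLEVELS	5U

/* Mirrors the t1 arithmetic: agf headers plus worst-case refcount
 * btree splits for nr_ops refcount updates. */
static unsigned int finish_cui_res(unsigned int nr_ops)
{
	unsigned int btree_blocks = nr_ops * 1 * (2 * REFC_MAXLEVELS - 1);

	return nr_ops * SECTSIZE + btree_blocks * BLOCKSIZE;
}

int main(void)
{
	/* One refcount update: 512 + 9 * 4096 = 37376 bytes of reservation */
	printf("%u\n", finish_cui_res(1));
	return 0;
}
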
@@ -257,13 +281,7 @@ xfs_calc_refcountbt_reservation( struct xfs_mount *mp, unsigned int nr_ops) { - unsigned int blksz = XFS_FSB_TO_B(mp, 1); - - if (!xfs_has_reflink(mp)) - return 0; - - return xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops), blksz); + return xfs_calc_finish_cui_reservation(mp, nr_ops); } /* @@ -348,6 +366,81 @@ xfs_calc_write_reservation_minlogsize( return xfs_calc_write_reservation(mp, true); } +/* + * Finishing an EFI can free the blocks and bmap blocks (t2): + * the agf for each of the ags: nr * sector size + * the agfl for each of the ags: nr * sector size + * the super block to reflect the freed blocks: sector size + * worst case split in allocation btrees per extent assuming nr extents: + * nr exts * 2 trees * (2 * max depth - 1) * block size + */ +inline unsigned int +xfs_calc_finish_efi_reservation( + struct xfs_mount *mp, + unsigned int nr) +{ + return xfs_calc_buf_res((2 * nr) + 1, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_allocfree_block_count(mp, nr), + mp->m_sb.sb_blocksize); +} + +/* + * Or, if it's a realtime file (t3): + * the agf for each of the ags: 2 * sector size + * the agfl for each of the ags: 2 * sector size + * the super block to reflect the freed blocks: sector size + * the realtime bitmap: + * 2 exts * ((XFS_BMBT_MAX_EXTLEN / rtextsize) / NBBY) bytes + * the realtime summary: 2 exts * 1 block + * worst case split in allocation btrees per extent assuming 2 extents: + * 2 exts * 2 trees * (2 * max depth - 1) * block size + */ +inline unsigned int +xfs_calc_finish_rt_efi_reservation( + struct xfs_mount *mp, + unsigned int nr) +{ + if (!xfs_has_realtime(mp)) + return 0; + + return xfs_calc_buf_res((2 * nr) + 1, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_rtalloc_block_count(mp, nr), + mp->m_sb.sb_blocksize) + + xfs_calc_buf_res(xfs_allocfree_block_count(mp, nr), + mp->m_sb.sb_blocksize); +} + +/* + * Finishing an RUI is the same as an EFI. We can split the rmap btree twice + * on each end of the record, and that can cause the AGFL to be refilled or + * emptied out. + */ +inline unsigned int +xfs_calc_finish_rui_reservation( + struct xfs_mount *mp, + unsigned int nr) +{ + if (!xfs_has_rmapbt(mp)) + return 0; + return xfs_calc_finish_efi_reservation(mp, nr); +} + +/* + * In finishing a BUI, we can modify: + * the inode being truncated: inode size + * dquots + * the inode's bmap btree: (max depth + 1) * block size + */ +inline unsigned int +xfs_calc_finish_bui_reservation( + struct xfs_mount *mp, + unsigned int nr) +{ + return xfs_calc_inode_res(mp, 1) + XFS_DQUOT_LOGRES + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, + mp->m_sb.sb_blocksize); +} + /* * In truncating a file we free up to two extents at once. 
We can modify (t1): * the inode being truncated: inode size @@ -380,16 +473,8 @@ xfs_calc_itruncate_reservation( t1 = xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, blksz); - t2 = xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_block_count(mp, 4), blksz); - - if (xfs_has_realtime(mp)) { - t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_rtalloc_block_count(mp, 2), blksz) + - xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz); - } else { - t3 = 0; - } + t2 = xfs_calc_finish_efi_reservation(mp, 4); + t3 = xfs_calc_finish_rt_efi_reservation(mp, 2); /* * In the early days of reflink, we included enough reservation to log @@ -909,6 +994,15 @@ xfs_calc_sb_reservation( return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); } +STATIC void +xfs_calc_default_atomic_ioend_reservation( + struct xfs_mount *mp, + struct xfs_trans_resv *resp) +{ + /* Pick a default that will scale reasonably for the log size. */ + resp->tr_atomic_ioend = resp->tr_itruncate; +} + void xfs_trans_resv_calc( struct xfs_mount *mp, @@ -1025,4 +1119,167 @@ xfs_trans_resv_calc( resp->tr_itruncate.tr_logcount += logcount_adj; resp->tr_write.tr_logcount += logcount_adj; resp->tr_qm_dqalloc.tr_logcount += logcount_adj; + + /* + * Now that we've finished computing the static reservations, we can + * compute the dynamic reservation for atomic writes. + */ + xfs_calc_default_atomic_ioend_reservation(mp, resp); +} + +/* + * Return the per-extent and fixed transaction reservation sizes needed to + * complete an atomic write. + */ +STATIC unsigned int +xfs_calc_atomic_write_ioend_geometry( + struct xfs_mount *mp, + unsigned int *step_size) +{ + const unsigned int efi = xfs_efi_log_space(1); + const unsigned int efd = xfs_efd_log_space(1); + const unsigned int rui = xfs_rui_log_space(1); + const unsigned int rud = xfs_rud_log_space(); + const unsigned int cui = xfs_cui_log_space(1); + const unsigned int cud = xfs_cud_log_space(); + const unsigned int bui = xfs_bui_log_space(1); + const unsigned int bud = xfs_bud_log_space(); + + /* + * Maximum overhead to complete an atomic write ioend in software: + * remove data fork extent + remove cow fork extent + map extent into + * data fork. + * + * tx0: Creates a BUI and a CUI and that's all it needs. + * + * tx1: Roll to finish the BUI. Need space for the BUD, an RUI, and + * enough space to relog the CUI (== CUI + CUD). + * + * tx2: Roll again to finish the RUI. Need space for the RUD and space + * to relog the CUI. + * + * tx3: Roll again, need space for the CUD and possibly a new EFI. + * + * tx4: Roll again, need space for an EFD. + * + * If the extent referenced by the pair of BUI/CUI items is not the one + * being currently processed, then we need to reserve space to relog + * both items. 
+ */ + const unsigned int tx0 = bui + cui; + const unsigned int tx1 = bud + rui + cui + cud; + const unsigned int tx2 = rud + cui + cud; + const unsigned int tx3 = cud + efi; + const unsigned int tx4 = efd; + const unsigned int relog = bui + bud + cui + cud; + + const unsigned int per_intent = max(max3(tx0, tx1, tx2), + max3(tx3, tx4, relog)); + + /* Overhead to finish one step of each intent item type */ + const unsigned int f1 = xfs_calc_finish_efi_reservation(mp, 1); + const unsigned int f2 = xfs_calc_finish_rui_reservation(mp, 1); + const unsigned int f3 = xfs_calc_finish_cui_reservation(mp, 1); + const unsigned int f4 = xfs_calc_finish_bui_reservation(mp, 1); + + /* We only finish one item per transaction in a chain */ + *step_size = max(f4, max3(f1, f2, f3)); + + return per_intent; +} + +/* + * Compute the maximum size (in fsblocks) of atomic writes that we can complete + * given the existing log reservations. + */ +xfs_extlen_t +xfs_calc_max_atomic_write_fsblocks( + struct xfs_mount *mp) +{ + const struct xfs_trans_res *resv = &M_RES(mp)->tr_atomic_ioend; + unsigned int per_intent = 0; + unsigned int step_size = 0; + unsigned int ret = 0; + + if (resv->tr_logres > 0) { + per_intent = xfs_calc_atomic_write_ioend_geometry(mp, + &step_size); + + if (resv->tr_logres >= step_size) + ret = (resv->tr_logres - step_size) / per_intent; + } + + trace_xfs_calc_max_atomic_write_fsblocks(mp, per_intent, step_size, + resv->tr_logres, ret); + + return ret; +} + +/* + * Compute the log blocks and transaction reservation needed to complete an + * atomic write of a given number of blocks. Worst case, each block requires + * separate handling. A return value of 0 means something went wrong. + */ +xfs_extlen_t +xfs_calc_atomic_write_log_geometry( + struct xfs_mount *mp, + xfs_extlen_t blockcount, + unsigned int *new_logres) +{ + struct xfs_trans_res *curr_res = &M_RES(mp)->tr_atomic_ioend; + uint old_logres = curr_res->tr_logres; + unsigned int per_intent, step_size; + unsigned int logres; + xfs_extlen_t min_logblocks; + + ASSERT(blockcount > 0); + + xfs_calc_default_atomic_ioend_reservation(mp, M_RES(mp)); + + per_intent = xfs_calc_atomic_write_ioend_geometry(mp, &step_size); + + /* Check for overflows */ + if (check_mul_overflow(blockcount, per_intent, &logres) || + check_add_overflow(logres, step_size, &logres)) + return 0; + + curr_res->tr_logres = logres; + min_logblocks = xfs_log_calc_minimum_size(mp); + curr_res->tr_logres = old_logres; + + trace_xfs_calc_max_atomic_write_log_geometry(mp, per_intent, step_size, + blockcount, min_logblocks, logres); + + *new_logres = logres; + return min_logblocks; +} + +/* + * Compute the transaction reservation needed to complete an out of place + * atomic write of a given number of blocks. + */ +int +xfs_calc_atomic_write_reservation( + struct xfs_mount *mp, + xfs_extlen_t blockcount) +{ + unsigned int new_logres; + xfs_extlen_t min_logblocks; + + /* + * If the caller doesn't ask for a specific atomic write size, then + * use the defaults. 
+ */ + if (blockcount == 0) { + xfs_calc_default_atomic_ioend_reservation(mp, M_RES(mp)); + return 0; + } + + min_logblocks = xfs_calc_atomic_write_log_geometry(mp, blockcount, + &new_logres); + if (!min_logblocks || min_logblocks > mp->m_sb.sb_logblocks) + return -EINVAL; + + M_RES(mp)->tr_atomic_ioend.tr_logres = new_logres; + return 0; } diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h index 0554b9d775d2697868d26e72d2deb138b5a2f034..a87e0798ba0ac016c9561e1105b7d08565b3a3d5 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.h +++ b/fs/xfs/libxfs/xfs_trans_resv.h @@ -48,6 +48,7 @@ struct xfs_trans_resv { struct xfs_trans_res tr_qm_dqalloc; /* allocate quota on disk */ struct xfs_trans_res tr_sb; /* modify superblock */ struct xfs_trans_res tr_fsyncts; /* update timestamps on fsync */ + struct xfs_trans_res tr_atomic_ioend; /* untorn write completion */ }; /* shorthand way of accessing reservation structure */ @@ -98,8 +99,28 @@ struct xfs_trans_resv { void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp); uint xfs_allocfree_block_count(struct xfs_mount *mp, uint num_ops); +unsigned int xfs_calc_finish_bui_reservation(struct xfs_mount *mp, + unsigned int nr_ops); + +unsigned int xfs_calc_finish_efi_reservation(struct xfs_mount *mp, + unsigned int nr_ops); +unsigned int xfs_calc_finish_rt_efi_reservation(struct xfs_mount *mp, + unsigned int nr_ops); + +unsigned int xfs_calc_finish_rui_reservation(struct xfs_mount *mp, + unsigned int nr_ops); + +unsigned int xfs_calc_finish_cui_reservation(struct xfs_mount *mp, + unsigned int nr_ops); + unsigned int xfs_calc_itruncate_reservation_minlogsize(struct xfs_mount *mp); unsigned int xfs_calc_write_reservation_minlogsize(struct xfs_mount *mp); unsigned int xfs_calc_qm_dqalloc_reservation_minlogsize(struct xfs_mount *mp); +xfs_extlen_t xfs_calc_max_atomic_write_fsblocks(struct xfs_mount *mp); +xfs_extlen_t xfs_calc_atomic_write_log_geometry(struct xfs_mount *mp, + xfs_extlen_t blockcount, unsigned int *new_logres); +int xfs_calc_atomic_write_reservation(struct xfs_mount *mp, + xfs_extlen_t blockcount); + #endif /* __XFS_TRANS_RESV_H__ */ diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index b6d63b8bdad5a22d01a2365a80386248c8419c78..55b0b79139e4e827759c4ef2a9c2fcd27af9face 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -76,6 +76,11 @@ xfs_bui_item_size( *nbytes += xfs_bui_log_format_sizeof(buip->bui_format.bui_nextents); } +unsigned int xfs_bui_log_space(unsigned int nr) +{ + return xlog_item_space(1, xfs_bui_log_format_sizeof(nr)); +} + /* * This is called to fill in the vector of log iovecs for the * given bui log item. We use only 1 iovec, and we point that @@ -167,6 +172,11 @@ xfs_bud_item_size( *nbytes += sizeof(struct xfs_bud_log_format); } +unsigned int xfs_bud_log_space(void) +{ + return xlog_item_space(1, sizeof(struct xfs_bud_log_format)); +} + /* * This is called to fill in the vector of log iovecs for the * given bud log item. 
We use only 1 iovec, and we point that diff --git a/fs/xfs/xfs_bmap_item.h b/fs/xfs/xfs_bmap_item.h index 3fafd3881a0bb2f372694e4680c7e28a2dc3ca13..d34eabd254fa1c7463e3fbf472164514276725ce 100644 --- a/fs/xfs/xfs_bmap_item.h +++ b/fs/xfs/xfs_bmap_item.h @@ -68,4 +68,7 @@ struct xfs_bud_log_item { extern struct kmem_cache *xfs_bui_cache; extern struct kmem_cache *xfs_bud_cache; +unsigned int xfs_bui_log_space(unsigned int nr); +unsigned int xfs_bud_log_space(void); + #endif /* __XFS_BMAP_ITEM_H__ */ diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 65365a554c4a88e9e218528d53d0db1e09184d5b..9e426d1d55aa42861e275ebd2ebf7b2886ff2b2d 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1961,8 +1961,38 @@ xfs_free_buftarg( kmem_free(btp); } +/* + * Configure this buffer target for hardware-assisted atomic writes if the + * underlying block device's atomic write geometry is congruent with the + * filesystem geometry. + */ +static inline void +xfs_configure_buftarg_atomic_writes( + struct xfs_buftarg *btp) +{ + struct xfs_mount *mp = btp->bt_mount; + unsigned int min_bytes, max_bytes; + + min_bytes = bdev_atomic_write_unit_min_bytes(btp->bt_bdev); + max_bytes = bdev_atomic_write_unit_max_bytes(btp->bt_bdev); + + /* + * Ignore atomic write geometry that is nonsense or doesn't even cover + * a single fsblock. + */ + if (min_bytes > max_bytes || + min_bytes > mp->m_sb.sb_blocksize || + max_bytes < mp->m_sb.sb_blocksize) { + min_bytes = 0; + max_bytes = 0; + } + + btp->bt_bdev_awu_min = min_bytes; + btp->bt_bdev_awu_max = max_bytes; +} + +/* Configure a buffer target that abstracts a block device. */ int -xfs_setsize_buftarg( +xfs_configure_buftarg( xfs_buftarg_t *btp, unsigned int sectorsize) { @@ -1981,6 +2011,9 @@ xfs_setsize_buftarg( btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev); btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1; + if (bdev_can_atomic_write(btp->bt_bdev)) + xfs_configure_buftarg_atomic_writes(btp); + return 0; } @@ -1993,7 +2026,7 @@ STATIC int xfs_setsize_buftarg_early( xfs_buftarg_t *btp) { - return xfs_setsize_buftarg(btp, bdev_logical_block_size(btp->bt_bdev)); + return xfs_configure_buftarg(btp, bdev_logical_block_size(btp->bt_bdev)); } struct xfs_buftarg * diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index f4bd2742a8c10e7056d81e300e0c1fa9c637f51c..27ae5368dee22a9d7271f1146c47c0177e986dfc 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -115,6 +115,10 @@ typedef struct xfs_buftarg { struct percpu_counter bt_io_count; struct ratelimit_state bt_ioerror_rl; + /* Atomic write unit values */ + unsigned int bt_bdev_awu_min; + unsigned int bt_bdev_awu_max; + KABI_EXTEND(struct shrinker_v2 *bt_shrinker) } xfs_buftarg_t; @@ -372,7 +376,7 @@ struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *mp, extern void xfs_free_buftarg(struct xfs_buftarg *); extern void xfs_buftarg_wait(struct xfs_buftarg *); extern void xfs_buftarg_drain(struct xfs_buftarg *); -extern int xfs_setsize_buftarg(struct xfs_buftarg *, unsigned int); +int xfs_configure_buftarg(struct xfs_buftarg *btp, unsigned int sectorsize); #define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index fb7b76f6289a3aad5af6e635b011ceab1d850576..f4e4742e200e63f67c31291296c3558983ba3b64 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -75,6 +75,25 @@ xfs_buf_item_straddle( return false; } +/* + * Compute the worst case log item
overhead for an invalidated buffer with the + * given map count and block size. + */ +unsigned int +xfs_buf_inval_log_space( + unsigned int map_count, + unsigned int blocksize) +{ + unsigned int chunks = DIV_ROUND_UP(blocksize, XFS_BLF_CHUNK); + unsigned int bitmap_size = DIV_ROUND_UP(chunks, NBWORD); + unsigned int ret = + offsetof(struct xfs_buf_log_format, blf_data_map) + + (bitmap_size * sizeof_field(struct xfs_buf_log_format, + blf_data_map[0])); + + return ret * map_count; +} + /* * Return the number of log iovecs and space needed to log the given buf log * item segment. diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 51e8ec085786cc5d27ee620409abbf4f829b4197..dfec034ae30fa328d4e13e35bdaf0a172ecf0a2a 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -75,6 +75,9 @@ static inline void xfs_buf_dquot_io_fail(struct xfs_buf *bp) void xfs_buf_iodone(struct xfs_buf *); bool xfs_buf_log_check_iovec(struct xfs_log_iovec *iovec); +unsigned int xfs_buf_inval_log_space(unsigned int map_count, + unsigned int blocksize); + extern struct kmem_cache *xfs_buf_item_cache; #endif /* __XFS_BUF_ITEM_H__ */ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index c9908fb337657a144ed011db7fd4a3c5d9973c50..e503ffafa518ba17a29f5abc585845fc3b5ae8f6 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -78,6 +78,11 @@ xfs_efi_item_size( *nbytes += xfs_efi_log_format_sizeof(efip->efi_format.efi_nextents); } +unsigned int xfs_efi_log_space(unsigned int nr) +{ + return xlog_item_space(1, xfs_efi_log_format_sizeof(nr)); +} + /* * This is called to fill in the vector of log iovecs for the * given efi log item. We use only 1 iovec, and we point that @@ -248,6 +253,11 @@ xfs_efd_item_size( *nbytes += xfs_efd_log_format_sizeof(efdp->efd_format.efd_nextents); } +unsigned int xfs_efd_log_space(unsigned int nr) +{ + return xlog_item_space(1, xfs_efd_log_format_sizeof(nr)); +} + /* * This is called to fill in the vector of log iovecs for the * given efd log item. 
We use only 1 iovec, and we point that diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h index da6a5afa607cf4ed70770a80f35dfc1b1e4a5715..d535f8b6f9ffaf16151a2e9eb447e250443757ea 100644 --- a/fs/xfs/xfs_extfree_item.h +++ b/fs/xfs/xfs_extfree_item.h @@ -88,4 +88,7 @@ xfs_efd_log_item_sizeof( extern struct kmem_cache *xfs_efi_cache; extern struct kmem_cache *xfs_efd_cache; +unsigned int xfs_efi_log_space(unsigned int nr); +unsigned int xfs_efd_log_space(unsigned int nr); + #endif /* __XFS_EXTFREE_ITEM_H__ */ diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index e0cef2cb5c53812be34f4ae60a4dde19bc68a439..ff7326d8444437685ab55e18fb44a9249887aa4d 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -516,7 +516,10 @@ xfs_dio_write_end_io( nofs_flag = memalloc_nofs_save(); if (flags & IOMAP_DIO_COW) { - error = xfs_reflink_end_cow(ip, offset, size); + if (iocb->ki_flags & IOCB_ATOMIC) + error = xfs_reflink_end_atomic_cow(ip, offset, size); + else + error = xfs_reflink_end_cow(ip, offset, size); if (error) goto out; } @@ -608,6 +611,72 @@ xfs_file_dio_write_aligned( return ret; } +/* + * Handle block atomic writes + * + * Two methods of atomic writes are supported: + * - REQ_ATOMIC-based, which would typically use some form of HW offload in the + * disk + * - COW-based, which uses a COW fork as a staging extent for data updates + * before atomically updating extent mappings for the range being written + * + */ +static noinline ssize_t +xfs_file_dio_write_atomic( + struct xfs_inode *ip, + struct kiocb *iocb, + struct iov_iter *from) +{ + unsigned int iolock = XFS_IOLOCK_SHARED; + ssize_t ret, ocount = iov_iter_count(from); + const struct iomap_ops *dops; + + /* + * HW offload should be faster, so try that first if it is already + * known that the write length is not too large. + */ + if (ocount > xfs_inode_buftarg(ip)->bt_bdev_awu_max) + dops = &xfs_atomic_write_cow_iomap_ops; + else + dops = &xfs_direct_write_iomap_ops; + +retry: + ret = xfs_ilock_iocb_for_write(iocb, &iolock); + if (ret) + return ret; + + ret = xfs_file_write_checks(iocb, from, &iolock); + if (ret) + goto out_unlock; + + /* Demote similar to xfs_file_dio_write_aligned() */ + if (iolock == XFS_IOLOCK_EXCL) { + xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); + iolock = XFS_IOLOCK_SHARED; + } + + trace_xfs_file_direct_write(iocb, from); + ret = iomap_dio_rw(iocb, from, dops, &xfs_dio_write_ops, + 0, NULL, 0); + + /* + * The retry mechanism is based on the ->iomap_begin method returning + * -ENOPROTOOPT, which signals that the REQ_ATOMIC-based write is not + * possible. The REQ_ATOMIC-based method would typically not be possible + * if the write spans multiple extents or the disk blocks are misaligned.
+ */ + if (ret == -ENOPROTOOPT && dops == &xfs_direct_write_iomap_ops) { + xfs_iunlock(ip, iolock); + dops = &xfs_atomic_write_cow_iomap_ops; + goto retry; + } + +out_unlock: + if (iolock) + xfs_iunlock(ip, iolock); + return ret; +} + /* * Handle block unaligned direct I/O writes * @@ -712,6 +781,8 @@ xfs_file_dio_write( return -EINVAL; if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask) return xfs_file_dio_write_unaligned(ip, iocb, from); + if (iocb->ki_flags & IOCB_ATOMIC) + return xfs_file_dio_write_atomic(ip, iocb, from); return xfs_file_dio_write_aligned(ip, iocb, from); } @@ -841,6 +912,18 @@ xfs_file_write_iter( if (IS_DAX(inode)) return xfs_file_dax_write(iocb, from); + if (iocb->ki_flags & IOCB_ATOMIC) { + if (ocount < xfs_get_atomic_write_min(ip)) + return -EINVAL; + + if (ocount > xfs_get_atomic_write_max(ip)) + return -EINVAL; + + ret = generic_atomic_write_valid(iocb, from); + if (ret) + return ret; + } + if (iocb->ki_flags & IOCB_DIRECT) { /* * Allow a directio write to fall back to a buffered @@ -1235,6 +1318,8 @@ xfs_file_open( return -EIO; file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC | FMODE_DIO_PARALLEL_WRITE | FMODE_CAN_ODIRECT; + if (xfs_get_atomic_write_min(XFS_I(inode)) > 0) + file->f_mode |= FMODE_CAN_ATOMIC_WRITE; return generic_file_open(inode, file); } diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 8a661a66f219560f81c83c32859e2ad26bedba56..b5bd4138cbfa951b4ec2b33595093c9e1acf80a7 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -321,6 +321,11 @@ static inline bool xfs_inode_has_bigrtalloc(struct xfs_inode *ip) (XFS_IS_REALTIME_INODE(ip) ? \ (ip)->i_mount->m_rtdev_targp : (ip)->i_mount->m_ddev_targp) +static inline bool xfs_inode_can_hw_atomic_write(const struct xfs_inode *ip) +{ + return xfs_inode_buftarg(ip)->bt_bdev_awu_max > 0; +} + /* * In-core inode flags. */ diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 40d02a1450d3cfc306c1560da64633feb69496d7..0cd69c08e897bfb97da6c4df5776accd9a8c01be 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -27,6 +27,7 @@ #include "xfs_dquot_item.h" #include "xfs_dquot.h" #include "xfs_reflink.h" +#include "xfs_icache.h" #define XFS_ALLOC_ALIGN(mp, off) \ (((off) >> mp->m_allocsize_log) << mp->m_allocsize_log) @@ -762,6 +763,38 @@ imap_spans_range( return true; } +static bool +xfs_bmap_hw_atomic_write_possible( + struct xfs_inode *ip, + struct xfs_bmbt_irec *imap, + xfs_fileoff_t offset_fsb, + xfs_fileoff_t end_fsb) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fsize_t len = XFS_FSB_TO_B(mp, end_fsb - offset_fsb); + + /* + * Atomic writes are required to be naturally aligned on disk blocks, + * which ensures that we adhere to the block layer rules and won't + * straddle any boundary or violate the write alignment requirement. + */ + if (!IS_ALIGNED(imap->br_startblock, imap->br_blockcount)) + return false; + + /* + * Spanning multiple extents would mean that multiple BIOs would be + * issued, and so we would lose the atomicity required for + * REQ_ATOMIC-based atomics. + */ + if (!imap_spans_range(imap, offset_fsb, end_fsb)) + return false; + + /* + * The ->iomap_begin caller should ensure this, but check anyway.
+	 */
+	return len <= xfs_inode_buftarg(ip)->bt_bdev_awu_max;
+}
+
 static int
 xfs_direct_write_iomap_begin(
 	struct inode		*inode,
@@ -776,9 +809,11 @@ xfs_direct_write_iomap_begin(
 	struct xfs_bmbt_irec	imap, cmap;
 	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
 	xfs_fileoff_t		end_fsb = xfs_iomap_end_fsb(mp, offset, length);
+	xfs_fileoff_t		orig_end_fsb = end_fsb;
 	int			nimaps = 1, error = 0;
 	bool			shared = false;
 	u16			iomap_flags = 0;
+	bool			needs_alloc;
 	unsigned int		lockmode = XFS_ILOCK_SHARED;
 	u64			seq;
 
@@ -795,6 +830,10 @@ xfs_direct_write_iomap_begin(
 	if (offset + length > i_size_read(inode))
 		iomap_flags |= IOMAP_F_DIRTY;
 
+	/* HW-offload atomics are always used in this path */
+	if (flags & IOMAP_ATOMIC)
+		iomap_flags |= IOMAP_F_ATOMIC_BIO;
+
 	error = xfs_ilock_for_iomap(ip, flags, &lockmode);
 	if (error)
 		return error;
@@ -815,13 +854,37 @@ xfs_direct_write_iomap_begin(
 			(flags & IOMAP_DIRECT) || IS_DAX(inode));
 		if (error)
 			goto out_unlock;
-		if (shared)
+		if (shared) {
+			if ((flags & IOMAP_ATOMIC) &&
+			    !xfs_bmap_hw_atomic_write_possible(ip, &cmap,
+					offset_fsb, end_fsb)) {
+				error = -ENOPROTOOPT;
+				goto out_unlock;
+			}
 			goto out_found_cow;
+		}
 		end_fsb = imap.br_startoff + imap.br_blockcount;
 		length = XFS_FSB_TO_B(mp, end_fsb) - offset;
 	}
 
-	if (imap_needs_alloc(inode, flags, &imap, nimaps))
+	needs_alloc = imap_needs_alloc(inode, flags, &imap, nimaps);
+
+	if (flags & IOMAP_ATOMIC) {
+		error = -ENOPROTOOPT;
+		/*
+		 * If we allocate less than what is required for the write
+		 * then we may end up with multiple extents, which means that
+		 * a REQ_ATOMIC-based write cannot be used, so avoid this
+		 * possibility.
+		 */
+		if (needs_alloc && orig_end_fsb - offset_fsb > 1)
+			goto out_unlock;
+
+		if (!xfs_bmap_hw_atomic_write_possible(ip, &imap, offset_fsb,
+				orig_end_fsb))
+			goto out_unlock;
+	}
+
+	if (needs_alloc)
 		goto allocate_blocks;
 
 	/*
@@ -909,6 +972,134 @@ const struct iomap_ops xfs_direct_write_iomap_ops = {
 	.iomap_begin		= xfs_direct_write_iomap_begin,
};
 
+static int
+xfs_atomic_write_cow_iomap_begin(
+	struct inode		*inode,
+	loff_t			offset,
+	loff_t			length,
+	unsigned		flags,
+	struct iomap		*iomap,
+	struct iomap		*srcmap)
+{
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	const xfs_fileoff_t	offset_fsb = XFS_B_TO_FSBT(mp, offset);
+	xfs_fileoff_t		end_fsb = xfs_iomap_end_fsb(mp, offset, length);
+	xfs_filblks_t		count_fsb = end_fsb - offset_fsb;
+	int			nmaps = 1;
+	xfs_filblks_t		resaligned;
+	struct xfs_bmbt_irec	cmap;
+	struct xfs_iext_cursor	icur;
+	struct xfs_trans	*tp;
+	unsigned int		dblocks = 0, rblocks = 0;
+	int			error;
+	u64			seq;
+
+	ASSERT(flags & IOMAP_WRITE);
+	ASSERT(flags & IOMAP_DIRECT);
+
+	if (xfs_is_shutdown(mp))
+		return -EIO;
+
+	if (!xfs_can_sw_atomic_write(mp)) {
+		ASSERT(xfs_can_sw_atomic_write(mp));
+		return -EINVAL;
+	}
+
+	/* blocks are always allocated in this path */
+	if (flags & IOMAP_NOWAIT)
+		return -EAGAIN;
+
+	trace_xfs_iomap_atomic_write_cow(ip, offset, length);
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+	if (!ip->i_cowfp) {
+		ASSERT(!xfs_is_reflink_inode(ip));
+		xfs_ifork_init_cow(ip);
+	}
+
+	if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap))
+		cmap.br_startoff = end_fsb;
+	if (cmap.br_startoff <= offset_fsb) {
+		xfs_trim_extent(&cmap, offset_fsb, count_fsb);
+		goto found;
+	}
+
+	end_fsb = cmap.br_startoff;
+	count_fsb = end_fsb - offset_fsb;
+
+	resaligned = xfs_aligned_fsb_count(offset_fsb, count_fsb,
+			xfs_get_cowextsz_hint(ip));
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+	if (XFS_IS_REALTIME_INODE(ip)) {
+		dblocks =
XFS_DIOSTRAT_SPACE_RES(mp, 0); + rblocks = resaligned; + } else { + dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); + rblocks = 0; + } + + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, dblocks, + rblocks, false, &tp); + if (error) + return error; + + /* extent layout could have changed since the unlock, so check again */ + if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap)) + cmap.br_startoff = end_fsb; + if (cmap.br_startoff <= offset_fsb) { + xfs_trim_extent(&cmap, offset_fsb, count_fsb); + xfs_trans_cancel(tp); + goto found; + } + + /* + * Allocate the entire reservation as unwritten blocks. + * + * Use XFS_BMAPI_EXTSZALIGN to hint at aligning new extents according to + * extszhint, such that there will be a greater chance that future + * atomic writes to that same range will be aligned (and don't require + * this COW-based method). + */ + error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, + XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC | + XFS_BMAPI_EXTSZALIGN, 0, &cmap, &nmaps); + if (error) { + xfs_trans_cancel(tp); + goto out_unlock; + } + + xfs_inode_set_cowblocks_tag(ip); + error = xfs_trans_commit(tp); + if (error) + goto out_unlock; + +found: + if (cmap.br_state != XFS_EXT_NORM) { + error = xfs_reflink_convert_cow_locked(ip, offset_fsb, + count_fsb); + if (error) + goto out_unlock; + cmap.br_state = XFS_EXT_NORM; + } + + length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount); + trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap); + seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq); + +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +} + +const struct iomap_ops xfs_atomic_write_cow_iomap_ops = { + .iomap_begin = xfs_atomic_write_cow_iomap_begin, +}; + static int xfs_dax_write_iomap_end( struct inode *inode, diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 4da13440bae9bd7e67f5044b8bfc3544d57e8be9..608ef9a58e8c6d931ab02008ab8c4e19f5f4a092 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -54,5 +54,6 @@ extern const struct iomap_ops xfs_read_iomap_ops; extern const struct iomap_ops xfs_seek_iomap_ops; extern const struct iomap_ops xfs_xattr_iomap_ops; extern const struct iomap_ops xfs_dax_write_iomap_ops; +extern const struct iomap_ops xfs_atomic_write_cow_iomap_ops; #endif /* __XFS_IOMAP_H__*/ diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 9b7f49ad3e52992658bfbd80a73b34fee6f5e065..57223706f0287f6abd37977975099c1d447f135d 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -549,6 +549,95 @@ xfs_stat_blksize( return PAGE_SIZE; } +static void +xfs_report_dioalign( + struct xfs_inode *ip, + struct kstat *stat) +{ + struct xfs_buftarg *target = xfs_inode_buftarg(ip); + struct block_device *bdev = target->bt_bdev; + + stat->result_mask |= STATX_DIOALIGN; + stat->dio_mem_align = bdev_dma_alignment(bdev) + 1; + stat->dio_offset_align = bdev_logical_block_size(bdev); +} + +unsigned int +xfs_get_atomic_write_min( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + + /* + * If we can complete an atomic write via atomic out of place writes, + * then advertise a minimum size of one fsblock. Without this + * mechanism, we can only guarantee atomic writes up to a single LBA. + * + * If out of place writes are not available, we can guarantee an atomic + * write of exactly one single fsblock if the bdev will make that + * guarantee for us. 
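+	 * Either way the advertised minimum is one fsblock, e.g. 4096 bytes
+	 * on a filesystem with the default 4k block size.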
+ */ + if (xfs_inode_can_hw_atomic_write(ip) || xfs_can_sw_atomic_write(mp)) + return mp->m_sb.sb_blocksize; + + return 0; +} + +unsigned int +xfs_get_atomic_write_max( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + + /* + * If out of place writes are not available, we can guarantee an atomic + * write of exactly one single fsblock if the bdev will make that + * guarantee for us. + */ + if (!xfs_can_sw_atomic_write(mp)) { + if (xfs_inode_can_hw_atomic_write(ip)) + return mp->m_sb.sb_blocksize; + return 0; + } + + /* + * If we can complete an atomic write via atomic out of place writes, + * then advertise a maximum size of whatever we can complete through + * that means. Hardware support is reported via max_opt, not here. + */ + return XFS_FSB_TO_B(mp, mp->m_awu_max); +} + +unsigned int +xfs_get_atomic_write_max_opt( + struct xfs_inode *ip) +{ + unsigned int awu_max = xfs_get_atomic_write_max(ip); + + /* if the max is 1x block, then just keep behaviour that opt is 0 */ + if (awu_max <= ip->i_mount->m_sb.sb_blocksize) + return 0; + + /* + * Advertise the maximum size of an atomic write that we can tell the + * block device to perform for us. In general the bdev limit will be + * less than our out of place write limit, but we don't want to exceed + * the awu_max. + */ + return min(awu_max, xfs_inode_buftarg(ip)->bt_bdev_awu_max); +} + +static void +xfs_report_atomic_write( + struct xfs_inode *ip, + struct kstat *stat) +{ + generic_fill_statx_atomic_writes(stat, + xfs_get_atomic_write_min(ip), + xfs_get_atomic_write_max(ip), + xfs_get_atomic_write_max_opt(ip)); +} + STATIC int xfs_vn_getattr( struct mnt_idmap *idmap, @@ -614,14 +703,10 @@ xfs_vn_getattr( stat->rdev = inode->i_rdev; break; case S_IFREG: - if (request_mask & STATX_DIOALIGN) { - struct xfs_buftarg *target = xfs_inode_buftarg(ip); - struct block_device *bdev = target->bt_bdev; - - stat->result_mask |= STATX_DIOALIGN; - stat->dio_mem_align = bdev_dma_alignment(bdev) + 1; - stat->dio_offset_align = bdev_logical_block_size(bdev); - } + if (request_mask & STATX_DIOALIGN) + xfs_report_dioalign(ip, stat); + if (request_mask & STATX_WRITE_ATOMIC) + xfs_report_atomic_write(ip, stat); fallthrough; default: stat->blksize = xfs_stat_blksize(ip); diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h index 52d6d510a21d9d6a50f2fcdf8a36695f323b7d4e..c88c74bf1198874a0d41a64221fc61b7e4b979e2 100644 --- a/fs/xfs/xfs_iops.h +++ b/fs/xfs/xfs_iops.h @@ -15,5 +15,8 @@ int xfs_vn_setattr_size(struct mnt_idmap *idmap, int xfs_inode_init_security(struct inode *inode, struct inode *dir, const struct qstr *qstr); +unsigned int xfs_get_atomic_write_min(struct xfs_inode *ip); +unsigned int xfs_get_atomic_write_max(struct xfs_inode *ip); +unsigned int xfs_get_atomic_write_max_opt(struct xfs_inode *ip); #endif /* __XFS_IOPS_H__ */ diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 3d36066215bef18a6e8b063d594038ed25201964..6425a35f47a47d9a7ff344fb7652318b65136edf 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -315,9 +315,7 @@ xlog_cil_alloc_shadow_bufs( * Then round nbytes up to 64-bit alignment so that the initial * buffer alignment is easy to calculate and verify. 
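 	 * For example, an item with two iovecs and 100 bytes of log data
 	 * needs 100 + 2 * (sizeof(uint64_t) + sizeof(struct xlog_op_header))
 	 * bytes, rounded up to the next 64-bit boundary, which is exactly
 	 * what xlog_item_space() computes.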
*/ - nbytes += niovecs * - (sizeof(uint64_t) + sizeof(struct xlog_op_header)); - nbytes = round_up(nbytes, sizeof(uint64_t)); + nbytes = xlog_item_space(niovecs, nbytes); /* * The data buffer needs to start 64-bit aligned, so round up diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index d5ca8619bb623f85258cf39fa9de2bf51915b4bb..1665e7f526d4dba33a46ccbae28a91e246fed9ff 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -699,4 +699,17 @@ xlog_kvmalloc( return p; } +/* + * Given a count of iovecs and space for a log item, compute the space we need + * in the log to store that data plus the log headers. + */ +static inline unsigned int +xlog_item_space( + unsigned int niovecs, + unsigned int nbytes) +{ + nbytes += niovecs * (sizeof(uint64_t) + sizeof(struct xlog_op_header)); + return round_up(nbytes, sizeof(uint64_t)); +} + #endif /* __XFS_LOG_PRIV_H__ */ diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index f7f34f4bdddad28bca95b1e0d9656606d98e110e..6f8ca03f34ca5f49f8a25dbcd564ed69136bdd2e 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -620,6 +620,123 @@ xfs_agbtree_compute_maxlevels( mp->m_agbtree_maxlevels = max(levels, mp->m_refc_maxlevels); } +/* Maximum atomic write IO size that the kernel allows. */ +static inline xfs_extlen_t xfs_calc_atomic_write_max(struct xfs_mount *mp) +{ + return rounddown_pow_of_two(XFS_B_TO_FSB(mp, MAX_RW_COUNT)); +} + +static inline unsigned int max_pow_of_two_factor(const unsigned int nr) +{ + return 1 << (ffs(nr) - 1); +} + +/* + * If the data device advertises atomic write support, limit the size of data + * device atomic writes to the greatest power-of-two factor of the AG size so + * that every atomic write unit aligns with the start of every AG. This is + * required so that the per-AG allocations for an atomic write will always be + * aligned compatibly with the alignment requirements of the storage. + * + * If the data device doesn't advertise atomic writes, then there are no + * alignment restrictions and the largest out-of-place write we can do + * ourselves is the number of blocks that user files can allocate from any AG. + */ +static inline xfs_extlen_t xfs_calc_perag_awu_max(struct xfs_mount *mp) +{ + if (mp->m_ddev_targp->bt_bdev_awu_min > 0) + return max_pow_of_two_factor(mp->m_sb.sb_agblocks); + return rounddown_pow_of_two(mp->m_ag_max_usable); +} + +/* Compute the maximum atomic write unit size for each section. */ +static inline void +xfs_calc_atomic_write_unit_max( + struct xfs_mount *mp) +{ + const xfs_extlen_t max_write = xfs_calc_atomic_write_max(mp); + const xfs_extlen_t max_ioend = xfs_reflink_max_atomic_cow(mp); + const xfs_extlen_t max_agsize = xfs_calc_perag_awu_max(mp); + + mp->m_awu_max = min3(max_write, max_ioend, max_agsize); + + trace_xfs_calc_atomic_write_unit_max(mp, max_write, max_ioend, + max_agsize); +} + +/* + * Try to set the atomic write maximum to a new value that we got from + * userspace via mount option. 
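+ * e.g. "mount -o max_atomic_write=64k" arrives here as 65536 bytes. A
+ * new_max_bytes value of zero skips the validation below and simply
+ * recomputes the default limits.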
+ */ +int +xfs_set_max_atomic_write_opt( + struct xfs_mount *mp, + unsigned long long new_max_bytes) +{ + const xfs_filblks_t new_max_fsbs = XFS_B_TO_FSBT(mp, new_max_bytes); + const xfs_extlen_t max_write = xfs_calc_atomic_write_max(mp); + const xfs_extlen_t max_group = mp->m_sb.sb_agblocks; + const xfs_extlen_t max_group_write = xfs_calc_perag_awu_max(mp); + int error; + + if (new_max_bytes == 0) + goto set_limit; + + ASSERT(max_write <= U32_MAX); + + /* generic_atomic_write_valid enforces power of two length */ + if (!is_power_of_2(new_max_bytes)) { + xfs_warn(mp, + "max atomic write size of %llu bytes is not a power of 2", + new_max_bytes); + return -EINVAL; + } + + if (new_max_bytes & mp->m_blockmask) { + xfs_warn(mp, + "max atomic write size of %llu bytes not aligned with fsblock", + new_max_bytes); + return -EINVAL; + } + + if (new_max_fsbs > max_write) { + xfs_warn(mp, + "max atomic write size of %lluk cannot be larger than max write size %lluk", + new_max_bytes >> 10, + XFS_FSB_TO_B(mp, max_write) >> 10); + return -EINVAL; + } + + if (new_max_fsbs > max_group) { + xfs_warn(mp, + "max atomic write size of %lluk cannot be larger than allocation group size %lluk", + new_max_bytes >> 10, + XFS_FSB_TO_B(mp, max_group) >> 10); + return -EINVAL; + } + + if (new_max_fsbs > max_group_write) { + xfs_warn(mp, + "max atomic write size of %lluk cannot be larger than max allocation group write size %lluk", + new_max_bytes >> 10, + XFS_FSB_TO_B(mp, max_group_write) >> 10); + return -EINVAL; + } + +set_limit: + error = xfs_calc_atomic_write_reservation(mp, new_max_fsbs); + if (error) { + xfs_warn(mp, + "cannot support completing atomic writes of %lluk", + new_max_bytes >> 10); + return error; + } + + xfs_calc_atomic_write_unit_max(mp); + mp->m_awu_max_bytes = new_max_bytes; + return 0; +} + /* * This function does the following on an initial mount of a file system: * - reads the superblock from disk and init the mount struct @@ -995,6 +1112,15 @@ xfs_mountfs( goto out_agresv; } + /* + * Pre-calculate atomic write unit max. This involves computations + * derived from transaction reservations, so we must do this after the + * log is fully initialized. + */ + error = xfs_set_max_atomic_write_opt(mp, mp->m_awu_max_bytes); + if (error) + goto out_agresv; + return 0; out_agresv: diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index c8f0f5e9e966a1c9ce90259f8a86761a531e6c4b..7cc6e8dcead079612096a5838ba48687ba980f43 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -161,6 +161,9 @@ typedef struct xfs_mount { bool m_finobt_nores; /* no per-AG finobt resv. */ bool m_update_sb; /* sb needs update in mount */ + /* max_atomic_write mount option value */ + unsigned long long m_awu_max_bytes; + /* * Bitsets of per-fs metadata that have been checked and/or are sick. * Callers must hold m_sb_lock to access these two fields. @@ -251,6 +254,12 @@ typedef struct xfs_mount { /* cpus that have inodes queued for inactivation */ struct cpumask m_inodegc_cpumask; + /* + * Maximum length of an atomic write for files stored in this + * collection of allocation groups, in fsblocks. 
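+	 * This is the smallest of the kernel I/O size limit, the AG
+	 * alignment limit and the transaction reservation limit; see
+	 * xfs_calc_atomic_write_unit_max().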
+ */ + xfs_extlen_t m_awu_max; + KABI_EXTEND(struct shrinker_v2 *m_inodegc_shrinker) } xfs_mount_t; @@ -353,6 +362,11 @@ __XFS_HAS_FEAT(bigtime, BIGTIME) __XFS_HAS_FEAT(needsrepair, NEEDSREPAIR) __XFS_HAS_FEAT(large_extent_counts, NREXT64) +static inline bool xfs_can_sw_atomic_write(struct xfs_mount *mp) +{ + return xfs_has_reflink(mp); +} + /* * Mount features * @@ -568,4 +582,7 @@ int xfs_add_incompat_log_feature(struct xfs_mount *mp, uint32_t feature); bool xfs_clear_incompat_log_features(struct xfs_mount *mp); void xfs_mod_delalloc(struct xfs_mount *mp, int64_t delta); +int xfs_set_max_atomic_write_opt(struct xfs_mount *mp, + unsigned long long new_max_bytes); + #endif /* __XFS_MOUNT_H__ */ diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index f1b2592238022e4216e9517a2a0011275622c082..57cdc8c3629ccc4396920735d01d1b1227ec6215 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -75,6 +75,11 @@ xfs_cui_item_size( *nbytes += xfs_cui_log_format_sizeof(cuip->cui_format.cui_nextents); } +unsigned int xfs_cui_log_space(unsigned int nr) +{ + return xlog_item_space(1, xfs_cui_log_format_sizeof(nr)); +} + /* * This is called to fill in the vector of log iovecs for the * given cui log item. We use only 1 iovec, and we point that @@ -173,6 +178,11 @@ xfs_cud_item_size( *nbytes += sizeof(struct xfs_cud_log_format); } +unsigned int xfs_cud_log_space(void) +{ + return xlog_item_space(1, sizeof(struct xfs_cud_log_format)); +} + /* * This is called to fill in the vector of log iovecs for the * given cud log item. We use only 1 iovec, and we point that diff --git a/fs/xfs/xfs_refcount_item.h b/fs/xfs/xfs_refcount_item.h index eb0ab13682d0b4702f5c7bd3696087709b79ed45..7ad490a775c48f00f78c4a55eb8eb209a5694e10 100644 --- a/fs/xfs/xfs_refcount_item.h +++ b/fs/xfs/xfs_refcount_item.h @@ -71,4 +71,7 @@ struct xfs_cud_log_item { extern struct kmem_cache *xfs_cui_cache; extern struct kmem_cache *xfs_cud_cache; +unsigned int xfs_cui_log_space(unsigned int nr); +unsigned int xfs_cud_log_space(void); + #endif /* __XFS_REFCOUNT_ITEM_H__ */ diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 3431d0d8b6f3a1ed9deba7f741b3913f5cc9dd65..493184f89b028185c634bfbbf3fdf0f312f5a716 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -241,7 +241,7 @@ xfs_bmap_trim_cow( return xfs_reflink_trim_around_shared(ip, imap, shared); } -static int +int xfs_reflink_convert_cow_locked( struct xfs_inode *ip, xfs_fileoff_t offset_fsb, @@ -702,41 +702,25 @@ xfs_reflink_cancel_cow_range( * requirements as low as possible. */ STATIC int -xfs_reflink_end_cow_extent( +xfs_reflink_end_cow_extent_locked( + struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t *offset_fsb, xfs_fileoff_t end_fsb) { struct xfs_iext_cursor icur; struct xfs_bmbt_irec got, del, data; - struct xfs_mount *mp = ip->i_mount; - struct xfs_trans *tp; struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK); - unsigned int resblks; int nmaps; int error; - resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, - XFS_TRANS_RESERVE, &tp); - if (error) - return error; - - /* - * Lock the inode. We have to ijoin without automatic unlock because - * the lead transaction is the refcountbt record deletion; the data - * fork update follows as a deferred log item. 
- */
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin(tp, ip, 0);
-
 	error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
 			XFS_IEXT_REFLINK_END_COW_CNT);
 	if (error == -EFBIG)
 		error = xfs_iext_count_upgrade(tp, ip,
 				XFS_IEXT_REFLINK_END_COW_CNT);
 	if (error)
-		goto out_cancel;
+		return error;
 
 	/*
 	 * In case of racing, overlapping AIO writes no COW extents might be
@@ -746,7 +730,7 @@ xfs_reflink_end_cow_extent(
 	if (!xfs_iext_lookup_extent(ip, ifp, *offset_fsb, &icur, &got) ||
 	    got.br_startoff >= end_fsb) {
 		*offset_fsb = end_fsb;
-		goto out_cancel;
+		return 0;
 	}
 
 	/*
@@ -760,7 +744,7 @@ xfs_reflink_end_cow_extent(
 		if (!xfs_iext_next_extent(ifp, &icur, &got) ||
 		    got.br_startoff >= end_fsb) {
 			*offset_fsb = end_fsb;
-			goto out_cancel;
+			return 0;
 		}
 	}
 	del = got;
@@ -771,7 +755,7 @@ xfs_reflink_end_cow_extent(
 	error = xfs_bmapi_read(ip, del.br_startoff, del.br_blockcount,
 			&data, &nmaps, 0);
 	if (error)
-		goto out_cancel;
+		return error;
 
 	/* We can only remap the smaller of the two extent sizes. */
 	data.br_blockcount = min(data.br_blockcount, del.br_blockcount);
@@ -801,7 +785,7 @@ xfs_reflink_end_cow_extent(
 		error = xfs_bunmapi(NULL, ip, data.br_startoff,
 				data.br_blockcount, 0, 1, &done);
 		if (error)
-			goto out_cancel;
+			return error;
 		ASSERT(done);
 	}
 
@@ -818,17 +802,45 @@ xfs_reflink_end_cow_extent(
 	/* Remove the mapping from the CoW fork. */
 	xfs_bmap_del_extent_cow(ip, &icur, &got, &del);
 
-	error = xfs_trans_commit(tp);
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	if (error)
-		return error;
-
 	/* Update the caller about how much progress we made. */
 	*offset_fsb = del.br_startoff + del.br_blockcount;
 	return 0;
+}
 
-out_cancel:
-	xfs_trans_cancel(tp);
+/*
+ * Remap part of the CoW fork into the data fork.
+ *
+ * We aim to remap the range starting at @offset_fsb and ending at @end_fsb
+ * into the data fork; this function will remap what it can (at the start of
+ * the range) and advance @offset_fsb to show how much progress was made.
+ * Each remap gets its own transaction because we can end up merging and
+ * splitting bmbt blocks for every remap operation and we'd like to keep the
+ * block reservation requirements as low as possible.
+ */
+STATIC int
+xfs_reflink_end_cow_extent(
+	struct xfs_inode	*ip,
+	xfs_fileoff_t		*offset_fsb,
+	xfs_fileoff_t		end_fsb)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_trans	*tp;
+	unsigned int		resblks;
+	int			error;
+
+	resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
+			XFS_TRANS_RESERVE, &tp);
+	if (error)
+		return error;
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, 0);
+
+	error = xfs_reflink_end_cow_extent_locked(tp, ip, offset_fsb, end_fsb);
+	if (error)
+		xfs_trans_cancel(tp);
+	else
+		error = xfs_trans_commit(tp);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	return error;
 }
@@ -891,6 +903,78 @@ xfs_reflink_end_cow(
 	return error;
 }
 
+/*
+ * Fully remap all of the file's data fork at once, which is the critical part
+ * in achieving atomic behaviour.
+ * The regular CoW end path does not use this function, so as to keep the
+ * block reservation per transaction as low as possible.
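+ * All mappings in the range are remapped within a single transaction, sized
+ * below for the worst case, so the update either completes for the whole
+ * range or not at all.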
+ */ +int +xfs_reflink_end_atomic_cow( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t count) +{ + xfs_fileoff_t offset_fsb; + xfs_fileoff_t end_fsb; + int error = 0; + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + unsigned int resblks; + + trace_xfs_reflink_end_cow(ip, offset, count); + + offset_fsb = XFS_B_TO_FSBT(mp, offset); + end_fsb = XFS_B_TO_FSB(mp, offset + count); + + /* + * Each remapping operation could cause a btree split, so in the worst + * case that's one for each block. + */ + resblks = (end_fsb - offset_fsb) * + XFS_NEXTENTADD_SPACE_RES(mp, 1, XFS_DATA_FORK); + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_atomic_ioend, resblks, 0, + XFS_TRANS_RESERVE, &tp); + if (error) + return error; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + while (end_fsb > offset_fsb && !error) { + error = xfs_reflink_end_cow_extent_locked(tp, ip, &offset_fsb, + end_fsb); + } + if (error) { + trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_); + goto out_cancel; + } + error = xfs_trans_commit(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +out_cancel: + xfs_trans_cancel(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +} + +/* Compute the largest atomic write that we can complete through software. */ +xfs_extlen_t +xfs_reflink_max_atomic_cow( + struct xfs_mount *mp) +{ + /* We cannot do any atomic writes without out of place writes. */ + if (!xfs_can_sw_atomic_write(mp)) + return 0; + + /* + * Atomic write limits must always be a power-of-2, according to + * generic_atomic_write_valid. + */ + return rounddown_pow_of_two(xfs_calc_max_atomic_write_fsblocks(mp)); +} + /* * Free all CoW staging blocks that are still referenced by the ondisk refcount * metadata. The ondisk metadata does not track which inode created the diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index 65c5dfe17ecf7fc95c6863a121f68545bc9acd70..937f38a6b75b0e68b3ea12d1f3bdd6d3c7ff9e32 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -26,6 +26,8 @@ int xfs_reflink_allocate_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap, bool convert_now); extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count); +int xfs_reflink_convert_cow_locked(struct xfs_inode *ip, + xfs_fileoff_t offset_fsb, xfs_filblks_t count_fsb); extern int xfs_reflink_cancel_cow_blocks(struct xfs_inode *ip, struct xfs_trans **tpp, xfs_fileoff_t offset_fsb, @@ -34,6 +36,8 @@ extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count, bool cancel_real); extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count); +int xfs_reflink_end_atomic_cow(struct xfs_inode *ip, xfs_off_t offset, + xfs_off_t count); extern int xfs_reflink_recover_cow(struct xfs_mount *mp); extern loff_t xfs_reflink_remap_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t len, @@ -52,5 +56,6 @@ extern int xfs_reflink_remap_blocks(struct xfs_inode *src, loff_t pos_in, loff_t *remapped); extern int xfs_reflink_update_dest(struct xfs_inode *dest, xfs_off_t newlen, xfs_extlen_t cowextsize, unsigned int remap_flags); +xfs_extlen_t xfs_reflink_max_atomic_cow(struct xfs_mount *mp); #endif /* __XFS_REFLINK_H */ diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 5e8a02d2b045de5443bbc584de4532613b7ea339..ce7ec4e0e6cfca14049bb8576dfd1a7e5271fb55 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -74,6 +74,11 @@ xfs_rui_item_size( *nbytes += 
xfs_rui_log_format_sizeof(ruip->rui_format.rui_nextents); } +unsigned int xfs_rui_log_space(unsigned int nr) +{ + return xlog_item_space(1, xfs_rui_log_format_sizeof(nr)); +} + /* * This is called to fill in the vector of log iovecs for the * given rui log item. We use only 1 iovec, and we point that @@ -171,6 +176,11 @@ xfs_rud_item_size( *nbytes += sizeof(struct xfs_rud_log_format); } +unsigned int xfs_rud_log_space(void) +{ + return xlog_item_space(1, sizeof(struct xfs_rud_log_format)); +} + /* * This is called to fill in the vector of log iovecs for the * given rud log item. We use only 1 iovec, and we point that diff --git a/fs/xfs/xfs_rmap_item.h b/fs/xfs/xfs_rmap_item.h index 802e5119eacaab640dc4b4d25e955cc02aadd695..8789169c24dcb96f18b7e651dc565df903232801 100644 --- a/fs/xfs/xfs_rmap_item.h +++ b/fs/xfs/xfs_rmap_item.h @@ -71,4 +71,7 @@ struct xfs_rud_log_item { extern struct kmem_cache *xfs_rui_cache; extern struct kmem_cache *xfs_rud_cache; +unsigned int xfs_rui_log_space(unsigned int nr); +unsigned int xfs_rud_log_space(void); + #endif /* __XFS_RMAP_ITEM_H__ */ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index ca82472d2558b18f7b0776bcb0b02356bc2dfd53..906831f97d1ff1c92fca704aff2819500d6d1bdf 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -101,7 +101,7 @@ enum { Opt_filestreams, Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota, Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce, - Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum, + Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum, Opt_max_atomic_write, }; static const struct fs_parameter_spec xfs_fs_parameters[] = { @@ -146,6 +146,7 @@ static const struct fs_parameter_spec xfs_fs_parameters[] = { fsparam_flag("nodiscard", Opt_nodiscard), fsparam_flag("dax", Opt_dax), fsparam_enum("dax", Opt_dax_enum, dax_param_enums), + fsparam_string("max_atomic_write", Opt_max_atomic_write), {} }; @@ -225,6 +226,9 @@ xfs_fs_show_options( if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT)) seq_puts(m, ",noquota"); + if (mp->m_awu_max_bytes) + seq_printf(m, ",max_atomic_write=%lluk", + mp->m_awu_max_bytes >> 10); return 0; } @@ -521,7 +525,7 @@ xfs_setup_devices( { int error; - error = xfs_setsize_buftarg(mp->m_ddev_targp, mp->m_sb.sb_sectsize); + error = xfs_configure_buftarg(mp->m_ddev_targp, mp->m_sb.sb_sectsize); if (error) return error; @@ -530,13 +534,13 @@ xfs_setup_devices( if (xfs_has_sector(mp)) log_sector_size = mp->m_sb.sb_logsectsize; - error = xfs_setsize_buftarg(mp->m_logdev_targp, + error = xfs_configure_buftarg(mp->m_logdev_targp, log_sector_size); if (error) return error; } if (mp->m_rtdev_targp) { - error = xfs_setsize_buftarg(mp->m_rtdev_targp, + error = xfs_configure_buftarg(mp->m_rtdev_targp, mp->m_sb.sb_sectsize); if (error) return error; @@ -1235,6 +1239,42 @@ suffix_kstrtoint( return ret; } +static int +suffix_kstrtoull( + const char *s, + unsigned int base, + unsigned long long *res) +{ + int last, shift_left_factor = 0; + unsigned long long _res; + char *value; + int ret = 0; + + value = kstrdup(s, GFP_KERNEL); + if (!value) + return -ENOMEM; + + last = strlen(value) - 1; + if (value[last] == 'K' || value[last] == 'k') { + shift_left_factor = 10; + value[last] = '\0'; + } + if (value[last] == 'M' || value[last] == 'm') { + shift_left_factor = 20; + value[last] = '\0'; + } + if (value[last] == 'G' || value[last] == 'g') { + shift_left_factor = 30; + value[last] = '\0'; + } + + if (kstrtoull(value, base, &_res)) + ret = -EINVAL; + 
kfree(value); + *res = _res << shift_left_factor; + return ret; +} + static inline void xfs_fs_warn_deprecated( struct fs_context *fc, @@ -1401,6 +1441,14 @@ xfs_fs_parse_param( xfs_fs_warn_deprecated(fc, param, XFS_FEAT_NOATTR2, true); parsing_mp->m_features |= XFS_FEAT_NOATTR2; return 0; + case Opt_max_atomic_write: + if (suffix_kstrtoull(param->string, 10, + &parsing_mp->m_awu_max_bytes)) { + xfs_warn(parsing_mp, + "max atomic write size must be positive integer"); + return -EINVAL; + } + return 0; default: xfs_warn(parsing_mp, "unknown mount option [%s].", param->key); return -EINVAL; @@ -1939,6 +1987,14 @@ xfs_fs_reconfigure( if (error) return error; + /* Validate new max_atomic_write option before making other changes */ + if (mp->m_awu_max_bytes != new_mp->m_awu_max_bytes) { + error = xfs_set_max_atomic_write_opt(mp, + new_mp->m_awu_max_bytes); + if (error) + return error; + } + /* inode32 -> inode64 */ if (xfs_has_small_inums(mp) && !xfs_has_small_inums(new_mp)) { mp->m_features &= ~XFS_FEAT_SMALL_INUMS; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 3e85acda8cfc6a6cf4f33e21a8a4b320b95c8ac8..179b81e2766205110d3b9bae518356fc68d66a23 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -144,6 +144,92 @@ DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound); DEFINE_ATTR_LIST_EVENT(xfs_attr_leaf_list); DEFINE_ATTR_LIST_EVENT(xfs_attr_node_list); +TRACE_EVENT(xfs_calc_atomic_write_unit_max, + TP_PROTO(struct xfs_mount *mp, unsigned int max_write, + unsigned int max_ioend, unsigned int max_agsize), + TP_ARGS(mp, max_write, max_ioend, max_agsize), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, max_write) + __field(unsigned int, max_ioend) + __field(unsigned int, max_agsize) + __field(unsigned int, data_awu_max) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->max_write = max_write; + __entry->max_ioend = max_ioend; + __entry->max_agsize = max_agsize; + __entry->data_awu_max = mp->m_awu_max; + ), + TP_printk("dev %d:%d max_write %u max_ioend %u max_agsize %u data_awu_max %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->max_write, + __entry->max_ioend, + __entry->max_agsize, + __entry->data_awu_max) +); + +TRACE_EVENT(xfs_calc_max_atomic_write_fsblocks, + TP_PROTO(struct xfs_mount *mp, unsigned int per_intent, + unsigned int step_size, unsigned int logres, + unsigned int blockcount), + TP_ARGS(mp, per_intent, step_size, logres, blockcount), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, per_intent) + __field(unsigned int, step_size) + __field(unsigned int, logres) + __field(unsigned int, blockcount) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->per_intent = per_intent; + __entry->step_size = step_size; + __entry->logres = logres; + __entry->blockcount = blockcount; + ), + TP_printk("dev %d:%d per_intent %u step_size %u logres %u blockcount %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->per_intent, + __entry->step_size, + __entry->logres, + __entry->blockcount) +); + +TRACE_EVENT(xfs_calc_max_atomic_write_log_geometry, + TP_PROTO(struct xfs_mount *mp, unsigned int per_intent, + unsigned int step_size, unsigned int blockcount, + unsigned int min_logblocks, unsigned int logres), + TP_ARGS(mp, per_intent, step_size, blockcount, min_logblocks, logres), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, per_intent) + __field(unsigned int, step_size) + __field(unsigned int, blockcount) + __field(unsigned int, min_logblocks) + __field(unsigned int, cur_logblocks) + 
__field(unsigned int, logres) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->per_intent = per_intent; + __entry->step_size = step_size; + __entry->blockcount = blockcount; + __entry->min_logblocks = min_logblocks; + __entry->cur_logblocks = mp->m_sb.sb_logblocks; + __entry->logres = logres; + ), + TP_printk("dev %d:%d per_intent %u step_size %u blockcount %u min_logblocks %u logblocks %u logres %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->per_intent, + __entry->step_size, + __entry->blockcount, + __entry->min_logblocks, + __entry->cur_logblocks, + __entry->logres) +); + TRACE_EVENT(xlog_intent_recovery_failed, TP_PROTO(struct xfs_mount *mp, int error, void *function), TP_ARGS(mp, error, function), @@ -1435,6 +1521,28 @@ DEFINE_RW_EVENT(xfs_file_direct_write); DEFINE_RW_EVENT(xfs_file_dax_write); DEFINE_RW_EVENT(xfs_reflink_bounce_dio_write); +TRACE_EVENT(xfs_iomap_atomic_write_cow, + TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), + TP_ARGS(ip, offset, count), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_off_t, offset) + __field(ssize_t, count) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->offset = offset; + __entry->count = count; + ), + TP_printk("dev %d:%d ino 0x%llx pos 0x%llx bytecount 0x%zx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->offset, + __entry->count) +) + DECLARE_EVENT_CLASS(xfs_imap_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, int whichfork, struct xfs_bmbt_irec *irec), diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 1f8b62f663a1f367efda1181150cde13ebef67f6..2d6d6f6ed0ee0fbc4a4659297e66bcdb9c62fd9f 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -195,6 +195,11 @@ typedef u16 blk_short_t; */ #define BLK_STS_DURATION_LIMIT ((__force blk_status_t)18) +/* + * Invalid size or alignment. 
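+ * Returned, for example, for an atomic write bio whose size is not a
+ * multiple of atomic_write_unit_min or exceeds atomic_write_unit_max.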
+ */ +#define BLK_STS_INVAL ((__force blk_status_t)19) + /** * blk_path_error - returns true if error may be path related * @error: status the request was completed with @@ -426,7 +431,7 @@ enum req_flag_bits { __REQ_SWAP, /* swap I/O */ __REQ_DRV, /* for driver use */ __REQ_FS_PRIVATE, /* for file system (submitter) use */ - + __REQ_ATOMIC, /* for atomic write operations */ /* * Command specific flags, keep last: */ @@ -458,6 +463,7 @@ enum req_flag_bits { #define REQ_SWAP (__force blk_opf_t)(1ULL << __REQ_SWAP) #define REQ_DRV (__force blk_opf_t)(1ULL << __REQ_DRV) #define REQ_FS_PRIVATE (__force blk_opf_t)(1ULL << __REQ_FS_PRIVATE) +#define REQ_ATOMIC (__force blk_opf_t)(1ULL << __REQ_ATOMIC) #define REQ_NOUNMAP (__force blk_opf_t)(1ULL << __REQ_NOUNMAP) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 32f9f1cc7d3c1f09a8d66671664ca67c3d600afd..43e4711da122025267ab54d5a41bf61269fc1c32 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -343,6 +343,16 @@ struct queue_limits { unsigned int discard_alignment; unsigned int zone_write_granularity; + /* atomic write limits */ + unsigned int atomic_write_hw_max; + unsigned int atomic_write_max_sectors; + unsigned int atomic_write_hw_boundary; + unsigned int atomic_write_boundary_sectors; + unsigned int atomic_write_hw_unit_min; + unsigned int atomic_write_unit_min; + unsigned int atomic_write_hw_unit_max; + unsigned int atomic_write_unit_max; + unsigned short max_segments; unsigned short max_integrity_segments; unsigned short max_discard_segments; @@ -941,14 +951,15 @@ static inline unsigned int bio_zone_is_seq(struct bio *bio) } /* - * Return how much of the chunk is left to be used for I/O at a given offset. + * Return how much within the boundary is left to be used for I/O at a given + * offset. */ -static inline unsigned int blk_chunk_sectors_left(sector_t offset, - unsigned int chunk_sectors) +static inline unsigned int blk_boundary_sectors_left(sector_t offset, + unsigned int boundary_sectors) { - if (unlikely(!is_power_of_2(chunk_sectors))) - return chunk_sectors - sector_div(offset, chunk_sectors); - return chunk_sectors - (offset & (chunk_sectors - 1)); + if (unlikely(!is_power_of_2(boundary_sectors))) + return boundary_sectors - sector_div(offset, boundary_sectors); + return boundary_sectors - (offset & (boundary_sectors - 1)); } /* @@ -1399,6 +1410,30 @@ static inline int queue_dma_alignment(const struct request_queue *q) return q ? 
q->limits.dma_alignment : 511; } +static inline unsigned int +queue_atomic_write_unit_max_bytes(const struct request_queue *q) +{ + return q->limits.atomic_write_unit_max; +} + +static inline unsigned int +queue_atomic_write_unit_min_bytes(const struct request_queue *q) +{ + return q->limits.atomic_write_unit_min; +} + +static inline unsigned int +queue_atomic_write_boundary_bytes(const struct request_queue *q) +{ + return q->limits.atomic_write_boundary_sectors << SECTOR_SHIFT; +} + +static inline unsigned int +queue_atomic_write_max_bytes(const struct request_queue *q) +{ + return q->limits.atomic_write_max_sectors << SECTOR_SHIFT; +} + static inline unsigned int bdev_dma_alignment(struct block_device *bdev) { return queue_dma_alignment(bdev_get_queue(bdev)); @@ -1649,6 +1684,43 @@ struct io_comp_batch { KABI_RESERVE(1) }; +static inline bool bdev_can_atomic_write(struct block_device *bdev) +{ + struct request_queue *bd_queue = bdev->bd_queue; + struct queue_limits *limits = &bd_queue->limits; + + if (!limits->atomic_write_unit_min) + return false; + + if (bdev_is_partition(bdev)) { + sector_t bd_start_sect = bdev->bd_start_sect; + unsigned int alignment = + max(limits->atomic_write_unit_min, + limits->atomic_write_hw_boundary); + + if (!IS_ALIGNED(bd_start_sect, alignment >> SECTOR_SHIFT)) + return false; + } + + return true; +} + +static inline unsigned int +bdev_atomic_write_unit_min_bytes(struct block_device *bdev) +{ + if (!bdev_can_atomic_write(bdev)) + return 0; + return queue_atomic_write_unit_min_bytes(bdev_get_queue(bdev)); +} + +static inline unsigned int +bdev_atomic_write_unit_max_bytes(struct block_device *bdev) +{ + if (!bdev_can_atomic_write(bdev)) + return 0; + return queue_atomic_write_unit_max_bytes(bdev_get_queue(bdev)); +} + #define DEFINE_IO_COMP_BATCH(name) struct io_comp_batch name = { } #endif /* _LINUX_BLKDEV_H */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 8dd49c400ec0d22ed8fbd8c89fc26d89d8a30f21..bf71382ffa0aa78a4adb05f8d8d032b37b0f9db0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -127,6 +127,8 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, #define FMODE_PWRITE ((__force fmode_t)0x10) /* File is opened for execution with sys_execve / sys_uselib */ #define FMODE_EXEC ((__force fmode_t)0x20) +/* File supports atomic writes */ +#define FMODE_CAN_ATOMIC_WRITE ((__force fmode_t)(0x80)) /* 32bit hashes as llseek() offset (for directories) */ #define FMODE_32BITHASH ((__force fmode_t)0x200) /* 64bit hashes as llseek() offset (for directories) */ @@ -347,6 +349,7 @@ enum rw_hint { #define IOCB_SYNC (__force int) RWF_SYNC #define IOCB_NOWAIT (__force int) RWF_NOWAIT #define IOCB_APPEND (__force int) RWF_APPEND +#define IOCB_ATOMIC (__force int) RWF_ATOMIC /* non-RWF related bits - start at 16 */ #define IOCB_EVENTFD (1 << 16) @@ -381,6 +384,7 @@ enum rw_hint { { IOCB_SYNC, "SYNC" }, \ { IOCB_NOWAIT, "NOWAIT" }, \ { IOCB_APPEND, "APPEND" }, \ + { IOCB_ATOMIC, "ATOMIC"}, \ { IOCB_EVENTFD, "EVENTFD"}, \ { IOCB_DIRECT, "DIRECT" }, \ { IOCB_WRITE, "WRITE" }, \ @@ -3228,6 +3232,10 @@ extern const struct inode_operations page_symlink_inode_operations; extern void kfree_link(void *); void generic_fillattr(struct mnt_idmap *, u32, struct inode *, struct kstat *); void generic_fill_statx_attr(struct inode *inode, struct kstat *stat); +void generic_fill_statx_atomic_writes(struct kstat *stat, + unsigned int unit_min, + unsigned int unit_max, + unsigned int unit_max_opt); extern int vfs_getattr_nosec(const struct path *, struct kstat 
*, u32, unsigned int); extern int vfs_getattr(const struct path *, struct kstat *, u32, unsigned int); void __inode_add_bytes(struct inode *inode, loff_t bytes); @@ -3409,7 +3417,8 @@ static inline int iocb_flags(struct file *file) return res; } -static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags) +static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags, + int rw_type) { int kiocb_flags = 0; @@ -3426,6 +3435,12 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags) return -EOPNOTSUPP; kiocb_flags |= IOCB_NOIO; } + if (flags & RWF_ATOMIC) { + if (rw_type != WRITE) + return -EOPNOTSUPP; + if (!(ki->ki_filp->f_mode & FMODE_CAN_ATOMIC_WRITE)) + return -EOPNOTSUPP; + } kiocb_flags |= (__force int) (flags & RWF_SUPPORTED); if (flags & RWF_SYNC) kiocb_flags |= IOCB_DSYNC; @@ -3658,4 +3673,6 @@ static inline bool cachefiles_ondemand_is_enabled(void) } #endif +int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter); + #endif /* _LINUX_FS_H */ diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 6bd7ed98cf1f45cc5e11de09a2462790cf47a0a7..406c47bb9e319b31c8c9df3add9cf60d47f0ef2a 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -53,6 +53,9 @@ struct vm_fault; * * IOMAP_F_XATTR indicates that the iomap is for an extended attribute extent * rather than a file data extent. + * + * IOMAP_F_ATOMIC_BIO indicates that (write) I/O will be issued as an atomic + * bio, i.e. set REQ_ATOMIC. */ #define IOMAP_F_NEW (1U << 0) #define IOMAP_F_DIRTY (1U << 1) @@ -64,6 +67,7 @@ struct vm_fault; #define IOMAP_F_BUFFER_HEAD 0 #endif /* CONFIG_BUFFER_HEAD */ #define IOMAP_F_XATTR (1U << 5) +#define IOMAP_F_ATOMIC_BIO (1U << 6) /* * Flags set by the core iomap code during operations: @@ -178,6 +182,7 @@ struct iomap_folio_ops { #else #define IOMAP_DAX 0 #endif /* CONFIG_FS_DAX */ +#define IOMAP_ATOMIC (1 << 9) /* torn-write protection */ struct iomap_ops { /* diff --git a/include/linux/stat.h b/include/linux/stat.h index 045064370cd84dc44827e651ffc3ac08e3d5480a..b06d170eef09c9831353b632ca8f28c9efe3d0b3 100644 --- a/include/linux/stat.h +++ b/include/linux/stat.h @@ -54,6 +54,10 @@ struct kstat { u32 dio_mem_align; u32 dio_offset_align; u64 change_cookie; + u32 atomic_write_unit_min; + u32 atomic_write_unit_max; + u32 atomic_write_unit_max_opt; + u32 atomic_write_segments_max; KABI_RESERVE(1) KABI_RESERVE(2) diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index b7b56871029c58148cace2e383249cd193e29151..d88bb0c3a048b7cc0b7f5dda4189af73068f1718 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -301,8 +301,11 @@ typedef int __bitwise __kernel_rwf_t; /* per-IO O_APPEND */ #define RWF_APPEND ((__force __kernel_rwf_t)0x00000010) +/* Atomic Write */ +#define RWF_ATOMIC ((__force __kernel_rwf_t)0x00000040) + /* mask of flags supported by the kernel */ #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ - RWF_APPEND) + RWF_APPEND | RWF_ATOMIC) #endif /* _UAPI_LINUX_FS_H */ diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h index 7cab2c65d3d7fce9210d2fb6d02012233b9923cf..44f10d925cff79d89da1d76a021d0e6b94d403cf 100644 --- a/include/uapi/linux/stat.h +++ b/include/uapi/linux/stat.h @@ -127,7 +127,16 @@ struct statx { __u32 stx_dio_mem_align; /* Memory buffer alignment for direct I/O */ __u32 stx_dio_offset_align; /* File offset alignment for direct I/O */ /* 0xa0 */ - __u64 __spare3[12]; /* Spare space for future expansion */ + __u64 stx_subvol; /* Subvolume identifier */ + 
__u32 stx_atomic_write_unit_min; /* Min atomic write unit in bytes */ + __u32 stx_atomic_write_unit_max; /* Max atomic write unit in bytes */ + /* 0xb0 */ + __u32 stx_atomic_write_segments_max; /* Max atomic write segment count */ + __u32 stx_dio_read_offset_align; /* File offset alignment for direct I/O reads */ + __u32 stx_atomic_write_unit_max_opt; /* Optimised max atomic write unit in bytes */ + __u32 __spare2[1]; + /* 0xb8 */ + __u64 __spare3[8]; /* Spare space for future expansion */ /* 0x100 */ }; @@ -154,6 +163,7 @@ struct statx { #define STATX_BTIME 0x00000800U /* Want/got stx_btime */ #define STATX_MNT_ID 0x00001000U /* Got stx_mnt_id */ #define STATX_DIOALIGN 0x00002000U /* Want/got direct I/O alignment info */ +#define STATX_WRITE_ATOMIC 0x00010000U /* Want/got atomic_write_* fields */ #define STATX__RESERVED 0x80000000U /* Reserved for future struct statx expansion */ @@ -189,6 +199,7 @@ struct statx { #define STATX_ATTR_MOUNT_ROOT 0x00002000 /* Root of a mount */ #define STATX_ATTR_VERITY 0x00100000 /* [I] Verity protected file */ #define STATX_ATTR_DAX 0x00200000 /* File is currently in DAX state */ +#define STATX_ATTR_WRITE_ATOMIC 0x00400000 /* File supports atomic write operations */ #endif /* _UAPI_LINUX_STAT_H */ diff --git a/io_uring/rw.c b/io_uring/rw.c index 75b001febb4d284c556aee784003b49f5758ddbb..d6190532f64b28fbc996ac42eb3003eec52d904e 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -660,7 +660,7 @@ static bool need_complete_io(struct io_kiocb *req) S_ISBLK(file_inode(req->file)->i_mode); } -static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) +static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); struct kiocb *kiocb = &rw->kiocb; @@ -675,7 +675,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) req->flags |= io_file_get_flags(file); kiocb->ki_flags = file->f_iocb_flags; - ret = kiocb_set_rw_flags(kiocb, rw->flags); + ret = kiocb_set_rw_flags(kiocb, rw->flags, rw_type); if (unlikely(ret)) return ret; kiocb->ki_flags |= IOCB_ALLOC_CACHE; @@ -743,7 +743,7 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) iov_iter_restore(&s->iter, &s->iter_state); iovec = NULL; } - ret = io_rw_init_file(req, FMODE_READ); + ret = io_rw_init_file(req, FMODE_READ, READ); if (unlikely(ret)) { kfree(iovec); return ret; @@ -918,7 +918,7 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) iov_iter_restore(&s->iter, &s->iter_state); iovec = NULL; } - ret = io_rw_init_file(req, FMODE_WRITE); + ret = io_rw_init_file(req, FMODE_WRITE, WRITE); if (unlikely(ret)) { kfree(iovec); return ret;
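For context, here is a minimal userspace sketch of how the interface added
above is consumed: query the per-file limits with statx(), then issue a
unit-sized, offset-aligned write with pwritev2(RWF_ATOMIC). This is
illustrative only and not part of the patch; it assumes a libc and uapi
headers that already expose statx(), STATX_WRITE_ATOMIC, the new
stx_atomic_write_unit_* fields and RWF_ATOMIC, and the file path is
hypothetical.

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <sys/stat.h>
	#include <sys/uio.h>
	#include <unistd.h>

	int main(void)
	{
		struct statx stx;
		struct iovec iov;
		unsigned int unit;
		void *buf;
		int fd;

		fd = open("/mnt/xfs/testfile", O_RDWR | O_DIRECT); /* hypothetical path */
		if (fd < 0)
			return 1;

		/* Ask the kernel for the atomic write limits of this file. */
		if (statx(fd, "", AT_EMPTY_PATH, STATX_WRITE_ATOMIC, &stx) < 0)
			return 1;
		if (!(stx.stx_mask & STATX_WRITE_ATOMIC) ||
		    stx.stx_atomic_write_unit_min == 0)
			return 1;	/* atomic writes not supported here */

		/* One unit_min-sized write at an aligned offset is always valid. */
		unit = stx.stx_atomic_write_unit_min;
		if (posix_memalign(&buf, unit, unit))
			return 1;
		memset(buf, 0x5a, unit);

		iov.iov_base = buf;
		iov.iov_len = unit;

		/* Either all "unit" bytes land on disk, or none of them do. */
		if (pwritev2(fd, &iov, 1, 0, RWF_ATOMIC) < 0) {
			perror("pwritev2");
			return 1;
		}

		free(buf);
		close(fd);
		return 0;
	}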