diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index 69919e14b9db..6978d643a22d 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -448,17 +448,26 @@ io_uring_disabled Prevents all processes from creating new io_uring instances. Enabling this shrinks the kernel's attack surface. -= ================================================================== -0 All processes can create io_uring instances as normal. This is the - default setting. -1 io_uring creation is disabled for unprivileged processes. - io_uring_setup fails with -EPERM unless the calling process is - privileged (CAP_SYS_ADMIN). Existing io_uring instances can - still be used. -2 io_uring creation is disabled for all processes. io_uring_setup += ====================================================================== +0 All processes can create io_uring instances as normal. This is the + default setting. +1 io_uring creation is disabled (io_uring_setup() will fail with + -EPERM) for unprivileged processes not in the io_uring_group group. + Existing io_uring instances can still be used. See the + documentation for io_uring_group for more information. +2 io_uring creation is disabled for all processes. io_uring_setup() always fails with -EPERM. Existing io_uring instances can still be - used. -= ================================================================== + used. += ====================================================================== + + +io_uring_group +============== + +When io_uring_disabled is set to 1, a process must either be +privileged (CAP_SYS_ADMIN) or be in the io_uring_group group in order +to create an io_uring instance. If io_uring_group is set to -1 (the +default), only processes with the CAP_SYS_ADMIN capability may create +io_uring instances.
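[Editor's illustration, not part of the patch] The documented behaviour above can be observed from userspace with a minimal probe: when creation is disabled for the calling process, io_uring_setup(2) fails and errno is set to EPERM. The sketch below uses the raw syscall directly (no liburing) and standard Linux UAPI headers; the program itself, its error handling, and the queue depth of 8 are illustrative assumptions, not anything defined by this patch.

/* Hypothetical probe: does the io_uring_disabled / io_uring_group policy
 * allow this process to create an io_uring instance? */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

int main(void)
{
	struct io_uring_params p;
	long fd;

	memset(&p, 0, sizeof(p));
	/* io_uring_setup(2): create a ring with 8 SQ entries */
	fd = syscall(__NR_io_uring_setup, 8, &p);
	if (fd < 0) {
		if (errno == EPERM)
			fprintf(stderr, "io_uring creation disabled for this process "
				"(see io_uring_disabled / io_uring_group)\n");
		else
			perror("io_uring_setup");
		return 1;
	}
	printf("io_uring available (ring fd %ld)\n", fd);
	close(fd);
	return 0;
}

With io_uring_disabled=1, the same probe succeeds once the caller either has CAP_SYS_ADMIN or belongs to the group whose gid is written to io_uring_group, per the documentation above.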
kexec_load_disabled diff --git a/MAINTAINERS b/MAINTAINERS index 9a927aff2258..efe733f830ad 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10086,7 +10086,6 @@ F: io_uring/ F: include/linux/io_uring.h F: include/linux/io_uring_types.h F: include/uapi/linux/io_uring.h -F: tools/io_uring/ IPMI SUBSYSTEM M: Corey Minyard diff --git a/block/blk-map.c b/block/blk-map.c index 04e591ae4a23..83741ea39fae 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -29,9 +29,11 @@ static struct bio_map_data *bio_alloc_map_data(struct iov_iter *data, bmd = kmalloc(struct_size(bmd, iov, data->nr_segs), gfp_mask); if (!bmd) return NULL; - memcpy(bmd->iov, data->iov, sizeof(struct iovec) * data->nr_segs); bmd->iter = *data; - bmd->iter.iov = bmd->iov; + if (iter_is_iovec(data)) { + memcpy(bmd->iov, iter_iov(data), sizeof(struct iovec) * data->nr_segs); + bmd->iter.__iov = bmd->iov; + } return bmd; } @@ -636,7 +638,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, copy = true; else if (iov_iter_is_bvec(iter)) map_bvec = true; - else if (!iter_is_iovec(iter)) + else if (!user_backed_iter(iter)) copy = true; else if (queue_virt_boundary(q)) copy = queue_virt_boundary(q) & iov_iter_gap_alignment(iter); @@ -677,9 +679,8 @@ int blk_rq_map_user(struct request_queue *q, struct request *rq, struct rq_map_data *map_data, void __user *ubuf, unsigned long len, gfp_t gfp_mask) { - struct iovec iov; struct iov_iter i; - int ret = import_single_range(rq_data_dir(rq), ubuf, len, &iov, &i); + int ret = import_ubuf(rq_data_dir(rq), ubuf, len, &i); if (unlikely(ret < 0)) return ret; diff --git a/block/fops.c b/block/fops.c index cbcd41c271d2..6992aca7adc9 100644 --- a/block/fops.c +++ b/block/fops.c @@ -506,7 +506,7 @@ static int blkdev_open(struct inode *inode, struct file *filp) * during an unstable branch. */ filp->f_flags |= O_LARGEFILE; - filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; + filp->f_mode |= FMODE_BUF_RASYNC; /* * Use the file private data to store the holder for exclusive openes. 
@@ -520,6 +520,9 @@ static int blkdev_open(struct inode *inode, struct file *filp) if (IS_ERR(bdev)) return PTR_ERR(bdev); + if (bdev_nowait(bdev)) + filp->f_mode |= FMODE_NOWAIT; + filp->f_mapping = bdev->bd_inode->i_mapping; filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping); return 0; diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 2b03d83198d2..f75aee8cf432 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -362,7 +362,7 @@ static unsigned zero_mmap_capabilities(struct file *file) /* can't do an in-place private mapping if there's no MMU */ static inline int private_mapping_ok(struct vm_area_struct *vma) { - return vma->vm_flags & VM_MAYSHARE; + return is_nommu_shared_mapping(vma->vm_flags); } #else diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c index b163d84dafbb..459206bb7f2a 100644 --- a/drivers/infiniband/hw/hfi1/file_ops.c +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -284,11 +284,12 @@ static ssize_t hfi1_write_iter(struct kiocb *kiocb, struct iov_iter *from) } while (dim) { + const struct iovec *iov = iter_iov(from); int ret; unsigned long count = 0; ret = hfi1_user_sdma_process_request( - fd, (struct iovec *)(from->iov + done), + fd, (struct iovec *)(iov + done), dim, &count); if (ret) { reqs = ret; diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c index af2e30e35321..c9ee9060ce2b 100644 --- a/drivers/infiniband/hw/qib/qib_file_ops.c +++ b/drivers/infiniband/hw/qib/qib_file_ops.c @@ -2246,10 +2246,10 @@ static ssize_t qib_write_iter(struct kiocb *iocb, struct iov_iter *from) struct qib_ctxtdata *rcd = ctxt_fp(iocb->ki_filp); struct qib_user_sdma_queue *pq = fp->pq; - if (!iter_is_iovec(from) || !from->nr_segs || !pq) + if (!from->user_backed || !from->nr_segs || !pq) return -EINVAL; - return qib_user_sdma_writev(rcd, pq, from->iov, from->nr_segs); + return qib_user_sdma_writev(rcd, pq, iter_iov(from), from->nr_segs); } static struct class *qib_class; diff --git a/drivers/net/tun.c b/drivers/net/tun.c index adf879e94526..a85c448b7a20 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1473,7 +1473,8 @@ static struct sk_buff *tun_napi_alloc_frags(struct tun_file *tfile, skb->truesize += skb->data_len; for (i = 1; i < it->nr_segs; i++) { - size_t fragsz = it->iov[i].iov_len; + const struct iovec *iov = iter_iov(it) + i; + size_t fragsz = iov->iov_len; struct page *page; void *frag; diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 777650c34236..73622e5c24df 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -551,7 +551,7 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns, struct io_uring_cmd *ioucmd, unsigned int issue_flags, bool vec) { struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); - const struct nvme_uring_cmd *cmd = ioucmd->cmd; + const struct nvme_uring_cmd *cmd = io_uring_sqe_cmd(ioucmd->sqe); struct request_queue *q = ns ?
ns->queue : ctrl->admin_q; struct nvme_uring_data d; struct nvme_command c; diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index 98dc2291e9a1..c91fe09810ef 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -1246,7 +1246,7 @@ static ssize_t ffs_epfile_read_iter(struct kiocb *kiocb, struct iov_iter *to) p->kiocb = kiocb; if (p->aio) { p->to_free = dup_iter(&p->data, to, GFP_KERNEL); - if (!p->to_free) { + if (!iter_is_ubuf(&p->data) && !p->to_free) { kfree(p); return -ENOMEM; } diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c index 01c3ead7d1b4..a77e0b29e5e5 100644 --- a/drivers/usb/gadget/legacy/inode.c +++ b/drivers/usb/gadget/legacy/inode.c @@ -613,7 +613,7 @@ ep_read_iter(struct kiocb *iocb, struct iov_iter *to) if (!priv) goto fail; priv->to_free = dup_iter(&priv->to, to, GFP_KERNEL); - if (!priv->to_free) { + if (!iter_is_ubuf(&priv->to) && !priv->to_free) { kfree(priv); goto fail; } diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index dd2348437621..a3bde36f8477 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -641,7 +641,7 @@ vhost_scsi_calc_sgls(struct iov_iter *iter, size_t bytes, int max_sgls) { int sgl_count = 0; - if (!iter || !iter->iov) { + if (!iter || !iter_iov(iter)) { pr_err("%s: iter->iov is NULL, but expected bytes: %zu" " present\n", __func__, bytes); return -EINVAL; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 9f9c21a1c422..459f60273333 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3621,10 +3621,15 @@ static int check_direct_read(struct btrfs_fs_info *fs_info, if (!iter_is_iovec(iter)) return 0; - for (seg = 0; seg < iter->nr_segs; seg++) - for (i = seg + 1; i < iter->nr_segs; i++) - if (iter->iov[seg].iov_base == iter->iov[i].iov_base) + for (seg = 0; seg < iter->nr_segs; seg++) { + for (i = seg + 1; i < iter->nr_segs; i++) { + const struct iovec *iov1 = iter_iov(iter) + seg; + const struct iovec *iov2 = iter_iov(iter) + i; + + if (iov1->iov_base == iov2->iov_base) return -EINVAL; + } + } return 0; } diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 666aa380011e..b19c2e4b91da 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -446,7 +446,7 @@ bailout: static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma) { - return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -ENOSYS; + return is_nommu_shared_mapping(vma->vm_flags) ? 
0 : -ENOSYS; } static unsigned long cramfs_physmem_get_unmapped_area(struct file *file, diff --git a/fs/exec.c b/fs/exec.c index 90e6838f6ff0..1fdab0ffd561 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -153,8 +153,6 @@ SYSCALL_DEFINE1(uselib, const char __user *, library) path_noexec(&file->f_path))) goto exit; - fsnotify_open(file); - error = -ENOEXEC; read_lock(&binfmt_lock); @@ -939,9 +937,6 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags) if (err) goto exit; - if (name->name[0] != '\0') - fsnotify_open(file); - out: return file; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 90018b39bd22..1366a16d4fbb 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -902,7 +902,8 @@ static int ext4_file_open(struct inode *inode, struct file *filp) return ret; } - filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; + filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | + FMODE_DIO_PARALLEL_WRITE; return dquot_file_open(inode, filp); } diff --git a/fs/fhandle.c b/fs/fhandle.c index d3cd8354a1c5..3075cd704264 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -236,7 +236,6 @@ static long do_handle_open(int mountdirfd, struct file_handle __user *ufh, retval = PTR_ERR(file); } else { retval = fd; - fsnotify_open(file); fd_install(fd, file); } path_put(&path); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index f2aac74acbdb..ceb50463e4bb 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1370,7 +1370,7 @@ out: static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii) { - return (unsigned long)ii->iov->iov_base + ii->iov_offset; + return (unsigned long)iter_iov(ii)->iov_base + ii->iov_offset; } static inline size_t fuse_get_frag_size(const struct iov_iter *ii, diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index a1af1c4e58af..0361f3df375b 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -19,10 +19,12 @@ * Private flags for iomap_dio, must not overlap with the public ones in * iomap.h: */ -#define IOMAP_DIO_WRITE_FUA (1 << 28) -#define IOMAP_DIO_NEED_SYNC (1 << 29) -#define IOMAP_DIO_WRITE (1 << 30) -#define IOMAP_DIO_DIRTY (1 << 31) +#define IOMAP_DIO_CALLER_COMP (1U << 26) +#define IOMAP_DIO_INLINE_COMP (1U << 27) +#define IOMAP_DIO_WRITE_THROUGH (1U << 28) +#define IOMAP_DIO_NEED_SYNC (1U << 29) +#define IOMAP_DIO_WRITE (1U << 30) +#define IOMAP_DIO_DIRTY (1U << 31) struct iomap_dio { struct kiocb *iocb; @@ -40,7 +42,6 @@ struct iomap_dio { struct { struct iov_iter *iter; struct task_struct *waiter; - struct bio *poll_bio; } submit; /* used for aio completion: */ @@ -53,12 +54,14 @@ struct iomap_dio { static void iomap_dio_submit_bio(const struct iomap_iter *iter, struct iomap_dio *dio, struct bio *bio, loff_t pos) { + struct kiocb *iocb = dio->iocb; + atomic_inc(&dio->ref); /* Sync dio can't be polled reliably */ - if ((dio->iocb->ki_flags & IOCB_HIPRI) && !is_sync_kiocb(dio->iocb)) { - bio_set_polled(bio, dio->iocb); - dio->submit.poll_bio = bio; + if ((iocb->ki_flags & IOCB_HIPRI) && !is_sync_kiocb(iocb)) { + bio_set_polled(bio, iocb); + WRITE_ONCE(iocb->private, bio); } if (dio->dops && dio->dops->submit_io) @@ -126,6 +129,11 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio) } EXPORT_SYMBOL_GPL(iomap_dio_complete); +static ssize_t iomap_dio_deferred_complete(void *data) +{ + return iomap_dio_complete(data); +} + static void iomap_dio_complete_work(struct work_struct *work) { struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work); @@ -148,27 +156,69 @@ static void iomap_dio_bio_end_io(struct bio *bio) { struct iomap_dio 
*dio = bio->bi_private; bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY); + struct kiocb *iocb = dio->iocb; if (bio->bi_status) iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status)); + if (!atomic_dec_and_test(&dio->ref)) + goto release_bio; - if (atomic_dec_and_test(&dio->ref)) { - if (dio->wait_for_completion) { - struct task_struct *waiter = dio->submit.waiter; - WRITE_ONCE(dio->submit.waiter, NULL); - blk_wake_io_task(waiter); - } else if (dio->flags & IOMAP_DIO_WRITE) { - struct inode *inode = file_inode(dio->iocb->ki_filp); + /* + * Synchronous dio, task itself will handle any completion work + * that needs after IO. All we need to do is wake the task. + */ + if (dio->wait_for_completion) { + struct task_struct *waiter = dio->submit.waiter; - WRITE_ONCE(dio->iocb->private, NULL); - INIT_WORK(&dio->aio.work, iomap_dio_complete_work); - queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work); - } else { - WRITE_ONCE(dio->iocb->private, NULL); - iomap_dio_complete_work(&dio->aio.work); - } + WRITE_ONCE(dio->submit.waiter, NULL); + blk_wake_io_task(waiter); + goto release_bio; } + /* + * Flagged with IOMAP_DIO_INLINE_COMP, we can complete it inline + */ + if (dio->flags & IOMAP_DIO_INLINE_COMP) { + WRITE_ONCE(iocb->private, NULL); + iomap_dio_complete_work(&dio->aio.work); + goto release_bio; + } + + /* + * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then schedule + * our completion that way to avoid an async punt to a workqueue. + */ + if (dio->flags & IOMAP_DIO_CALLER_COMP) { + /* only polled IO cares about private cleared */ + iocb->private = dio; + iocb->dio_complete = iomap_dio_deferred_complete; + + /* + * Invoke ->ki_complete() directly. We've assigned our + * dio_complete callback handler, and since the issuer set + * IOCB_DIO_CALLER_COMP, we know their ki_complete handler will + * notice ->dio_complete being set and will defer calling that + * handler until it can be done from a safe task context. + * + * Note that the 'res' being passed in here is not important + * for this case. The actual completion value of the request + * will be gotten from dio_complete when that is run by the + * issuer. + */ + iocb->ki_complete(iocb, 0); + goto release_bio; + } + + /* + * Async DIO completion that requires filesystem level completion work + * gets punted to a work queue to complete as the operation may require + * more IO to be issued to finalise filesystem metadata changes or + * guarantee data integrity. + */ + INIT_WORK(&dio->aio.work, iomap_dio_complete_work); + queue_work(file_inode(iocb->ki_filp)->i_sb->s_dio_done_wq, + &dio->aio.work); +release_bio: if (should_dirty) { bio_check_pages_dirty(bio); } else { @@ -197,7 +247,7 @@ static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, /* * Figure out the bio's operation flags from the dio request, the * mapping, and whether or not we want FUA. Note that we can end up - * clearing the WRITE_FUA flag in the dio request. + * clearing the WRITE_THROUGH flag in the dio request. 
*/ static inline unsigned int iomap_dio_bio_opflags(struct iomap_dio *dio, const struct iomap *iomap, bool use_fua) @@ -217,7 +267,7 @@ static inline unsigned int iomap_dio_bio_opflags(struct iomap_dio *dio, if (use_fua) opflags |= REQ_FUA; else - dio->flags &= ~IOMAP_DIO_WRITE_FUA; + dio->flags &= ~IOMAP_DIO_WRITE_THROUGH; return opflags; } @@ -258,12 +308,19 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, * Use a FUA write if we need datasync semantics, this is a pure * data IO that doesn't require any metadata updates (including * after IO completion such as unwritten extent conversion) and - * the underlying device supports FUA. This allows us to avoid - * cache flushes on IO completion. + * the underlying device either supports FUA or doesn't have + * a volatile write cache. This allows us to avoid cache flushes + * on IO completion. If we can't use writethrough and need to + * sync, disable in-task completions as dio completion will + * need to call generic_write_sync() which will do a blocking + * fsync / cache flush call. */ if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) && - (dio->flags & IOMAP_DIO_WRITE_FUA) && bdev_fua(iomap->bdev)) + (dio->flags & IOMAP_DIO_WRITE_THROUGH) && + (bdev_fua(iomap->bdev) || !bdev_write_cache(iomap->bdev))) use_fua = true; + else if (dio->flags & IOMAP_DIO_NEED_SYNC) + dio->flags &= ~IOMAP_DIO_CALLER_COMP; } /* @@ -278,10 +335,23 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, goto out; /* - * We can only poll for single bio I/Os. + * We can only do deferred completion for pure overwrites that + * don't require additional IO at completion. This rules out + * writes that need zeroing or extent conversion, extend + * the file size, or issue journal IO or cache flushes + * during completion processing. */ if (need_zeroout || + ((dio->flags & IOMAP_DIO_NEED_SYNC) && !use_fua) || ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) + dio->flags &= ~IOMAP_DIO_CALLER_COMP; + + /* + * The rules for polled IO completions follow the guidelines as the + * ones we set for inline and deferred completions. If none of those + * are available for this IO, clear the polled flag. + */ + if (!(dio->flags & (IOMAP_DIO_INLINE_COMP|IOMAP_DIO_CALLER_COMP))) dio->iocb->ki_flags &= ~IOCB_HIPRI; if (need_zeroout) { @@ -502,9 +572,11 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, dio->submit.iter = iter; dio->submit.waiter = current; - dio->submit.poll_bio = NULL; if (iov_iter_rw(iter) == READ) { + /* reads can always complete inline */ + dio->flags |= IOMAP_DIO_INLINE_COMP; + if (iomi.pos >= dio->i_size) goto out_free_dio; @@ -523,6 +595,15 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, iomi.flags |= IOMAP_WRITE; dio->flags |= IOMAP_DIO_WRITE; + /* + * Flag as supporting deferred completions, if the issuer + * groks it. This can avoid a workqueue punt for writes. + * We may later clear this flag if we need to do other IO + * as part of this IO completion. + */ + if (iocb->ki_flags & IOCB_DIO_CALLER_COMP) + dio->flags |= IOMAP_DIO_CALLER_COMP; + if (iocb->ki_flags & IOCB_NOWAIT) { if (filemap_range_has_page(mapping, iomi.pos, end)) { ret = -EAGAIN; @@ -536,13 +617,16 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, dio->flags |= IOMAP_DIO_NEED_SYNC; /* - * For datasync only writes, we optimistically try using FUA for - * this IO. Any non-FUA write that occurs will clear this flag, - * hence we know before completion whether a cache flush is - * necessary. 
+ * For datasync only writes, we optimistically try using + * WRITE_THROUGH for this IO. This flag requires either + * FUA writes through the device's write cache, or a + * normal write to a device without a volatile write + * cache. For the former, Any non-FUA write that occurs + * will clear this flag, hence we know before completion + * whether a cache flush is necessary. */ if ((iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) == IOCB_DSYNC) - dio->flags |= IOMAP_DIO_WRITE_FUA; + dio->flags |= IOMAP_DIO_WRITE_THROUGH; } if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) { @@ -615,14 +699,13 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, iomap_dio_set_error(dio, ret); /* - * If all the writes we issued were FUA, we don't need to flush the - * cache on IO completion. Clear the sync flag for this case. + * If all the writes we issued were already written through to the + * media, we don't need to flush the cache on IO completion. Clear the + * sync flag for this case. */ - if (dio->flags & IOMAP_DIO_WRITE_FUA) + if (dio->flags & IOMAP_DIO_WRITE_THROUGH) dio->flags &= ~IOMAP_DIO_NEED_SYNC; - WRITE_ONCE(iocb->private, dio->submit.poll_bio); - /* * We are about to drop our additional submission reference, which * might be the last reference to the dio. There are three different diff --git a/fs/open.c b/fs/open.c index d9e087a49add..6f3b13ef1ead 100644 --- a/fs/open.c +++ b/fs/open.c @@ -886,6 +886,11 @@ static int do_dentry_open(struct file *f, truncate_pagecache(inode, 0); } + /* + * Once we return a file with FMODE_OPENED, __fput() will call + * fsnotify_close(), so we need fsnotify_open() here for symmetry. + */ + fsnotify_open(f); return 0; cleanup_all: @@ -1270,7 +1275,6 @@ static long do_sys_openat2(int dfd, const char __user *filename, put_unused_fd(fd); fd = PTR_ERR(f); } else { - fsnotify_open(f); fd_install(fd, f); } } diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index a534acfd6423..7a71e7050797 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -390,6 +390,12 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) if (!ovl_should_sync(OVL_FS(inode->i_sb))) ifl &= ~(IOCB_DSYNC | IOCB_SYNC); + /* + * Overlayfs doesn't support deferred completions, don't copy + * this property in case it is set by the issuer. 
+ */ + ifl &= ~IOCB_DIO_CALLER_COMP; + old_cred = ovl_override_creds(file_inode(file)->i_sb); if (is_sync_kiocb(iocb)) { file_start_write(real.file); diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index a6d21fc0033c..d9c6cd8ed0d0 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -40,7 +40,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) } if (atomic_read(&mm->mm_count) > 1 || - vma->vm_flags & VM_MAYSHARE) { + is_nommu_shared_mapping(vma->vm_flags)) { sbytes += size; } else { bytes += size; diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index c614ffdb28be..f0a3fcec1b6b 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -264,7 +264,7 @@ out: */ static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma) { - if (!(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) + if (!is_nommu_shared_mapping(vma->vm_flags)) return -ENOSYS; file_accessed(file); diff --git a/fs/read_write.c b/fs/read_write.c index b4f231e4948a..0407005760d7 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -749,15 +749,14 @@ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter, return -EOPNOTSUPP; while (iov_iter_count(iter)) { - struct iovec iovec = iov_iter_iovec(iter); ssize_t nr; if (type == READ) { - nr = filp->f_op->read(filp, iovec.iov_base, - iovec.iov_len, ppos); + nr = filp->f_op->read(filp, iter_iov_addr(iter), + iter_iov_len(iter), ppos); } else { - nr = filp->f_op->write(filp, iovec.iov_base, - iovec.iov_len, ppos); + nr = filp->f_op->write(filp, iter_iov_addr(iter), + iter_iov_len(iter), ppos); } if (nr < 0) { @@ -766,7 +765,7 @@ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter, break; } ret += nr; - if (nr != iovec.iov_len) + if (nr != iter_iov_len(iter)) break; iov_iter_advance(iter, nr); } diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c index 2c4a23113fb5..4578dc45e50a 100644 --- a/fs/romfs/mmap-nommu.c +++ b/fs/romfs/mmap-nommu.c @@ -63,7 +63,7 @@ static unsigned long romfs_get_unmapped_area(struct file *file, */ static int romfs_mmap(struct file *file, struct vm_area_struct *vma) { - return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -ENOSYS; + return is_nommu_shared_mapping(vma->vm_flags) ? 
0 : -ENOSYS; } static unsigned romfs_mmap_capabilities(struct file *file) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 4d409400e504..9a57da209801 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1171,7 +1171,8 @@ xfs_file_open( { if (xfs_is_shutdown(XFS_M(inode->i_sb))) return -EIO; - file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC; + file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC | + FMODE_DIO_PARALLEL_WRITE; return generic_file_open(inode, file); } diff --git a/include/linux/fs.h b/include/linux/fs.h index 16a5092ae629..c4f078448397 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -159,6 +159,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, /* File supports DIRECT IO */ #define FMODE_CAN_ODIRECT ((__force fmode_t)0x400000) +/* File supports non-exclusive O_DIRECT writes from multiple threads */ +#define FMODE_DIO_PARALLEL_WRITE ((__force fmode_t)0x1000000) + /* File was opened by fanotify and shouldn't generate fanotify events */ #define FMODE_NONOTIFY ((__force fmode_t)0x4000000) @@ -312,20 +315,60 @@ enum rw_hint { #define IOCB_NOIO (1 << 20) /* can use bio alloc cache */ #define IOCB_ALLOC_CACHE (1 << 21) +/* + * IOCB_DIO_CALLER_COMP can be set by the iocb owner, to indicate that the + * iocb completion can be passed back to the owner for execution from a safe + * context rather than needing to be punted through a workqueue. If this + * flag is set, the bio completion handling may set iocb->dio_complete to a + * handler function and iocb->private to context information for that handler. + * The issuer should call the handler with that context information from task + * context to complete the processing of the iocb. Note that while this + * provides a task context for the dio_complete() callback, it should only be + * used on the completion side for non-IO generating completions. It's fine to + * call blocking functions from this callback, but they should not wait for + * unrelated IO (like cache flushing, new IO generation, etc). + */ +#define IOCB_DIO_CALLER_COMP (1 << 22) + +/* for use in trace events */ +#define TRACE_IOCB_STRINGS \ + { IOCB_HIPRI, "HIPRI" }, \ + { IOCB_DSYNC, "DSYNC" }, \ + { IOCB_SYNC, "SYNC" }, \ + { IOCB_NOWAIT, "NOWAIT" }, \ + { IOCB_APPEND, "APPEND" }, \ + { IOCB_EVENTFD, "EVENTFD"}, \ + { IOCB_DIRECT, "DIRECT" }, \ + { IOCB_WRITE, "WRITE" }, \ + { IOCB_WAITQ, "WAITQ" }, \ + { IOCB_NOIO, "NOIO" }, \ + { IOCB_ALLOC_CACHE, "ALLOC_CACHE" }, \ + { IOCB_DIO_CALLER_COMP, "CALLER_COMP" } struct kiocb { struct file *ki_filp; - - /* The 'ki_filp' pointer is shared in a union for aio */ - randomized_struct_fields_start - loff_t ki_pos; void (*ki_complete)(struct kiocb *iocb, long ret); void *private; int ki_flags; u16 ki_ioprio; /* See linux/ioprio.h */ - struct wait_page_queue *ki_waitq; /* for async buffered IO */ - randomized_struct_fields_end + union { + /* + * Only used for async buffered reads, where it denotes the + * page waitqueue associated with completing the read. Valid + * IFF IOCB_WAITQ is set. + */ + struct wait_page_queue *ki_waitq; + /* + * Can be used for O_DIRECT IO, where the completion handling + * is punted back to the issuer of the IO. May only be set + * if IOCB_DIO_CALLER_COMP is set by the issuer, and the issuer + * must then check for presence of this handler when ki_complete + * is invoked. The data passed in to this handler must be + * assigned to ->private when dio_complete is assigned. 
+ */ + ssize_t (*dio_complete)(void *data); + }; }; static inline bool is_sync_kiocb(struct kiocb *kiocb) diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 35b9328ca335..106cdc55ff3b 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -24,7 +24,7 @@ enum io_uring_cmd_flags { struct io_uring_cmd { struct file *file; - const void *cmd; + const struct io_uring_sqe *sqe; union { /* callback to defer completions to task context */ void (*task_work_cb)(struct io_uring_cmd *cmd, unsigned); @@ -36,18 +36,33 @@ struct io_uring_cmd { u8 pdu[32]; /* available inline for free use */ }; +static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe) +{ + return sqe->cmd; +} + #if defined(CONFIG_IO_URING) int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, struct iov_iter *iter, void *ioucmd); void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2, unsigned issue_flags); -void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *, unsigned)); struct sock *io_uring_get_socket(struct file *file); void __io_uring_cancel(bool cancel_all); void __io_uring_free(struct task_struct *tsk); void io_uring_unreg_ringfd(void); const char *io_uring_get_opcode(u8 opcode); +void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *, unsigned), + unsigned flags); +/* users should follow semantics of IOU_F_TWQ_LAZY_WAKE */ +void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *, unsigned)); + +static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *, unsigned)) +{ + __io_uring_cmd_do_in_task(ioucmd, task_work_cb, 0); +} static inline void io_uring_files_cancel(void) { @@ -66,6 +81,7 @@ static inline void io_uring_free(struct task_struct *tsk) if (tsk->io_uring) __io_uring_free(tsk); } +int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags); #else static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, struct iov_iter *iter, void *ioucmd) @@ -80,6 +96,10 @@ static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, void (*task_work_cb)(struct io_uring_cmd *, unsigned)) { } +static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *, unsigned)) +{ +} static inline struct sock *io_uring_get_socket(struct file *file) { return NULL; @@ -97,6 +117,11 @@ static inline const char *io_uring_get_opcode(u8 opcode) { return ""; } +static inline int io_uring_cmd_sock(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + return -EOPNOTSUPP; +} #endif #endif diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 128a67a40065..13d19b9be9f4 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -58,7 +58,7 @@ struct io_uring_task { struct xarray xa; struct wait_queue_head wait; - atomic_t in_idle; + atomic_t in_cancel; atomic_t inflight_tracked; struct percpu_counter inflight; @@ -69,8 +69,8 @@ struct io_uring_task { }; struct io_uring { - u32 head ____cacheline_aligned_in_smp; - u32 tail ____cacheline_aligned_in_smp; + u32 head; + u32 tail; }; /* @@ -176,7 +176,6 @@ struct io_submit_state { unsigned short submit_nr; unsigned int cqes_count; struct blk_plug plug; - struct io_uring_cqe cqes[16]; }; struct io_ev_fd { @@ -188,28 +187,34 @@ struct io_ev_fd { 
}; struct io_alloc_cache { - struct hlist_head list; + struct io_wq_work_node list; unsigned int nr_cached; + unsigned int max_cached; + size_t elem_size; }; struct io_ring_ctx { /* const or read-mostly hot data */ struct { - struct percpu_ref refs; - - struct io_rings *rings; unsigned int flags; - enum task_work_notify_mode notify_method; - unsigned int compat: 1; unsigned int drain_next: 1; unsigned int restricted: 1; unsigned int off_timeout_used: 1; unsigned int drain_active: 1; - unsigned int drain_disabled: 1; unsigned int has_evfd: 1; - unsigned int syscall_iopoll: 1; /* all CQEs should be posted only by the submitter task */ unsigned int task_complete: 1; + unsigned int lockless_cq: 1; + unsigned int syscall_iopoll: 1; + unsigned int poll_activated: 1; + unsigned int drain_disabled: 1; + unsigned int compat: 1; + + struct task_struct *submitter_task; + struct io_rings *rings; + struct percpu_ref refs; + + enum task_work_notify_mode notify_method; } ____cacheline_aligned_in_smp; /* submission data */ @@ -237,7 +242,6 @@ struct io_ring_ctx { * uring_lock, and updated through io_uring_register(2) */ struct io_rsrc_node *rsrc_node; - int rsrc_cached_refs; atomic_t cancel_seq; struct io_file_table file_table; unsigned nr_user_files; @@ -248,32 +252,21 @@ struct io_ring_ctx { struct io_buffer_list *io_bl; struct xarray io_bl_xa; - struct list_head io_buffers_cache; struct io_hash_table cancel_table_locked; - struct list_head cq_overflow_list; struct io_alloc_cache apoll_cache; struct io_alloc_cache netmsg_cache; + + /* + * ->iopoll_list is protected by the ctx->uring_lock for + * io_uring instances that don't use IORING_SETUP_SQPOLL. + * For SQPOLL, only the single threaded io_sq_thread() will + * manipulate the list, hence no extra locking is needed there. + */ + struct io_wq_work_list iopoll_list; + bool poll_multi_queue; } ____cacheline_aligned_in_smp; - /* IRQ completion list, under ->completion_lock */ - struct io_wq_work_list locked_free_list; - unsigned int locked_free_nr; - - const struct cred *sq_creds; /* cred used for __io_sq_thread() */ - struct io_sq_data *sq_data; /* if using sq thread polling */ - - struct wait_queue_head sqo_sq_wait; - struct list_head sqd_list; - - unsigned long check_cq; - - unsigned int file_alloc_start; - unsigned int file_alloc_end; - - struct xarray personalities; - u32 pers_next; - struct { /* * We cache a range of free CQEs we can use, once exhausted it @@ -285,54 +278,69 @@ struct io_ring_ctx { unsigned cached_cq_tail; unsigned cq_entries; struct io_ev_fd __rcu *io_ev_fd; - struct wait_queue_head cq_wait; unsigned cq_extra; } ____cacheline_aligned_in_smp; + /* + * task_work and async notification delivery cacheline. Expected to + * regularly bounce b/w CPUs. + */ struct { - spinlock_t completion_lock; - - bool poll_multi_queue; - - /* - * ->iopoll_list is protected by the ctx->uring_lock for - * io_uring instances that don't use IORING_SETUP_SQPOLL. - * For SQPOLL, only the single threaded io_sq_thread() will - * manipulate the list, hence no extra locking is needed there. 
- */ - struct io_wq_work_list iopoll_list; - struct io_hash_table cancel_table; - struct llist_head work_llist; - - struct list_head io_buffers_comp; + unsigned long check_cq; + atomic_t cq_wait_nr; + atomic_t cq_timeouts; + struct wait_queue_head cq_wait; } ____cacheline_aligned_in_smp; /* timeouts */ struct { spinlock_t timeout_lock; - atomic_t cq_timeouts; struct list_head timeout_list; struct list_head ltimeout_list; unsigned cq_last_tm_flush; } ____cacheline_aligned_in_smp; - /* Keep this last, we don't need it for the fast path */ + struct io_uring_cqe completion_cqes[16]; + spinlock_t completion_lock; + + /* IRQ completion list, under ->completion_lock */ + struct io_wq_work_list locked_free_list; + unsigned int locked_free_nr; + + struct list_head io_buffers_comp; + struct list_head cq_overflow_list; + struct io_hash_table cancel_table; + + const struct cred *sq_creds; /* cred used for __io_sq_thread() */ + struct io_sq_data *sq_data; /* if using sq thread polling */ + + struct wait_queue_head sqo_sq_wait; + struct list_head sqd_list; + + unsigned int file_alloc_start; + unsigned int file_alloc_end; + + struct xarray personalities; + u32 pers_next; + + struct list_head io_buffers_cache; + + /* Keep this last, we don't need it for the fast path */ + struct wait_queue_head poll_wq; struct io_restriction restrictions; - struct task_struct *submitter_task; /* slow path rsrc auxilary data, used by update/register */ - struct io_rsrc_node *rsrc_backup_node; struct io_mapped_ubuf *dummy_ubuf; struct io_rsrc_data *file_data; struct io_rsrc_data *buf_data; - struct delayed_work rsrc_put_work; - struct callback_head rsrc_put_tw; - struct llist_head rsrc_put_llist; + /* protected by ->uring_lock */ struct list_head rsrc_ref_list; - spinlock_t rsrc_ref_lock; + struct io_alloc_cache rsrc_node_cache; + struct wait_queue_head rsrc_quiesce_wq; + unsigned rsrc_quiesce; struct list_head io_buffers_pages; @@ -357,10 +365,25 @@ struct io_ring_ctx { u32 iowq_limits[2]; bool iowq_limits_set; + struct callback_head poll_wq_task_work; struct list_head defer_list; unsigned sq_thread_idle; /* protected by ->completion_lock */ unsigned evfd_last_cq_tail; + + /* + * If IORING_SETUP_NO_MMAP is used, then the below holds + * the gup'ed pages for the two rings, and the sqes. + */ + unsigned short n_ring_pages; + unsigned short n_sqe_pages; + struct page **ring_pages; + struct page **sqe_pages; +}; + +struct io_tw_state { + /* ->uring_lock is taken, callbacks can use io_tw_lock to lock it */ + bool locked; }; enum { @@ -391,7 +414,6 @@ enum { REQ_F_SINGLE_POLL_BIT, REQ_F_DOUBLE_POLL_BIT, REQ_F_PARTIAL_IO_BIT, - REQ_F_CQE32_INIT_BIT, REQ_F_APOLL_MULTISHOT_BIT, REQ_F_CLEAR_POLLIN_BIT, REQ_F_HASH_LOCKED_BIT, @@ -461,15 +483,13 @@ enum { REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT), /* fast poll multishot mode */ REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT), - /* ->extra1 and ->extra2 are initialised */ - REQ_F_CQE32_INIT = BIT(REQ_F_CQE32_INIT_BIT), /* recvmsg special flag, clear EPOLLIN */ REQ_F_CLEAR_POLLIN = BIT(REQ_F_CLEAR_POLLIN_BIT), /* hashed into ->cancel_hash_locked, protected by ->uring_lock */ REQ_F_HASH_LOCKED = BIT(REQ_F_HASH_LOCKED_BIT), }; -typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked); +typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts); struct io_task_work { struct llist_node node; @@ -559,14 +579,9 @@ struct io_kiocb { atomic_t refs; atomic_t poll_refs; struct io_task_work io_task_work; + unsigned nr_tw; /* for polled requests, i.e. 
IORING_OP_POLL_ADD and async armed poll */ - union { - struct hlist_node hash_node; - struct { - u64 extra1; - u64 extra2; - }; - }; + struct hlist_node hash_node; /* internal polling, see IORING_FEAT_FAST_POLL */ struct async_poll *apoll; /* opcode allocated if it needs to store data for async defer */ @@ -576,6 +591,11 @@ struct io_kiocb { /* custom credentials, valid IFF REQ_F_CREDS is set */ const struct cred *creds; struct io_wq_work work; + + struct { + u64 extra1; + u64 extra2; + } big_cqe; }; struct io_overflow_cqe { diff --git a/include/linux/mm.h b/include/linux/mm.h index 740c6512d8ba..cb3ddad99889 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1279,6 +1279,21 @@ static inline bool is_cow_mapping(vm_flags_t flags) return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; } +#ifndef CONFIG_MMU +static inline bool is_nommu_shared_mapping(vm_flags_t flags) +{ + /* + * NOMMU shared mappings are ordinary MAP_SHARED mappings and selected + * R/O MAP_PRIVATE file mappings that are an effective R/O overlay of + * a file mapping. R/O MAP_PRIVATE mappings might still modify + * underlying memory if ptrace is active, so this is only possible if + * ptrace does not apply. Note that there is no mprotect() to upgrade + * write permissions later. + */ + return flags & VM_MAYSHARE; +} +#endif + #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) #define SECTION_IN_PAGE_FLAGS #endif diff --git a/include/linux/mroute.h b/include/linux/mroute.h index 80b8400ab8b2..4c5003afee6c 100644 --- a/include/linux/mroute.h +++ b/include/linux/mroute.h @@ -18,10 +18,11 @@ static inline int ip_mroute_opt(int opt) int ip_mroute_setsockopt(struct sock *, int, sockptr_t, unsigned int); int ip_mroute_getsockopt(struct sock *, int, sockptr_t, sockptr_t); -int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg); +int ipmr_ioctl(struct sock *sk, int cmd, void *arg); int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg); int ip_mr_init(void); bool ipmr_rule_default(const struct fib_rule *rule); +int ipmr_sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg); #else static inline int ip_mroute_setsockopt(struct sock *sock, int optname, sockptr_t optval, unsigned int optlen) @@ -35,7 +36,7 @@ static inline int ip_mroute_getsockopt(struct sock *sk, int optname, return -ENOPROTOOPT; } -static inline int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg) +static inline int ipmr_ioctl(struct sock *sk, int cmd, void *arg) { return -ENOIOCTLCMD; } @@ -54,6 +55,12 @@ static inline bool ipmr_rule_default(const struct fib_rule *rule) { return true; } + +static inline int ipmr_sk_ioctl(struct sock *sk, unsigned int cmd, + void __user *arg) +{ + return 1; +} #endif #define VIFF_STATIC 0x8000 diff --git a/include/linux/mroute6.h b/include/linux/mroute6.h index 8f2b307fb124..63ef5191cc57 100644 --- a/include/linux/mroute6.h +++ b/include/linux/mroute6.h @@ -29,10 +29,10 @@ struct sock; extern int ip6_mroute_setsockopt(struct sock *, int, sockptr_t, unsigned int); extern int ip6_mroute_getsockopt(struct sock *, int, sockptr_t, sockptr_t); extern int ip6_mr_input(struct sk_buff *skb); -extern int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg); extern int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg); extern int ip6_mr_init(void); extern void ip6_mr_cleanup(void); +int ip6mr_ioctl(struct sock *sk, int cmd, void *arg); #else static inline int ip6_mroute_setsockopt(struct sock *sock, int optname, sockptr_t optval, unsigned int optlen) @@ 
-48,7 +48,7 @@ int ip6_mroute_getsockopt(struct sock *sock, } static inline -int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg) +int ip6mr_ioctl(struct sock *sk, int cmd, void *arg) { return -ENOIOCTLCMD; } @@ -100,6 +100,27 @@ extern int ip6mr_get_route(struct net *net, struct sk_buff *skb, #ifdef CONFIG_IPV6_MROUTE bool mroute6_is_socket(struct net *net, struct sk_buff *skb); extern int ip6mr_sk_done(struct sock *sk); +static inline int ip6mr_sk_ioctl(struct sock *sk, unsigned int cmd, + void __user *arg) +{ + switch (cmd) { + /* These userspace buffers will be consumed by ip6mr_ioctl() */ + case SIOCGETMIFCNT_IN6: { + struct sioc_mif_req6 buffer; + + return sock_ioctl_inout(sk, cmd, arg, &buffer, + sizeof(buffer)); + } + case SIOCGETSGCNT_IN6: { + struct sioc_sg_req6 buffer; + + return sock_ioctl_inout(sk, cmd, arg, &buffer, + sizeof(buffer)); + } + } + + return 1; +} #else static inline bool mroute6_is_socket(struct net *net, struct sk_buff *skb) { @@ -109,5 +130,11 @@ static inline int ip6mr_sk_done(struct sock *sk) { return 0; } + +static inline int ip6mr_sk_ioctl(struct sock *sk, unsigned int cmd, + void __user *arg) +{ + return 1; +} #endif #endif diff --git a/include/linux/socket.h b/include/linux/socket.h index f36497969091..43b7d154148f 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -320,6 +320,7 @@ struct ucred { */ #define MSG_ZEROCOPY 0x4000000 /* Use user data in kernel path */ +#define MSG_SPLICE_PAGES 0x8000000 /* Splice the pages from the iterator in sendmsg() */ #define MSG_FASTOPEN 0x20000000 /* Send data in TCP SYN */ #define MSG_CMSG_CLOEXEC 0x40000000 /* Set close_on_exec for file descriptor received through @@ -330,6 +331,8 @@ struct ucred { #define MSG_CMSG_COMPAT 0 /* We never have 32 bit fixups */ #endif +/* Flags to be cleared on entry by sendmsg and sendmmsg syscalls */ +#define MSG_INTERNAL_SENDMSG_FLAGS (MSG_SPLICE_PAGES) /* Setsockoptions(2) level. 
Thanks to BSD these must match IPPROTO_xxx */ #define SOL_IP 0 diff --git a/include/linux/uio.h b/include/linux/uio.h index 0507aae67d45..1cdc951aff1b 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -49,7 +49,8 @@ struct iov_iter { }; size_t count; union { - const struct iovec *iov; + /* use iter_iov() to get the current vec */ + const struct iovec *__iov; const struct kvec *kvec; const struct bio_vec *bvec; struct xarray *xarray; @@ -66,6 +67,10 @@ struct iov_iter { }; }; +#define iter_iov(iter) (iter)->__iov +#define iter_iov_addr(iter) (iter_iov(iter)->iov_base + (iter)->iov_offset) +#define iter_iov_len(iter) (iter_iov(iter)->iov_len - (iter)->iov_offset) + static inline enum iter_type iov_iter_type(const struct iov_iter *i) { return i->iter_type; @@ -141,15 +146,6 @@ static inline size_t iov_length(const struct iovec *iov, unsigned long nr_segs) return ret; } -static inline struct iovec iov_iter_iovec(const struct iov_iter *iter) -{ - return (struct iovec) { - .iov_base = iter->iov->iov_base + iter->iov_offset, - .iov_len = min(iter->count, - iter->iov->iov_len - iter->iov_offset), - }; -} - size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, size_t bytes, struct iov_iter *i); void iov_iter_advance(struct iov_iter *i, size_t bytes); @@ -343,6 +339,7 @@ ssize_t __import_iovec(int type, const struct iovec __user *uvec, struct iov_iter *i, bool compat); int import_single_range(int type, void __user *buf, size_t len, struct iovec *iov, struct iov_iter *i); +int import_ubuf(int type, void __user *buf, size_t len, struct iov_iter *i); static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction, void __user *buf, size_t count) diff --git a/include/net/phonet/phonet.h b/include/net/phonet/phonet.h index 862f1719b523..cf5ecae4a2fc 100644 --- a/include/net/phonet/phonet.h +++ b/include/net/phonet/phonet.h @@ -109,4 +109,25 @@ void phonet_sysctl_exit(void); int isi_register(void); void isi_unregister(void); +static inline bool sk_is_phonet(struct sock *sk) +{ + return sk->sk_family == PF_PHONET; +} + +static inline int phonet_sk_ioctl(struct sock *sk, unsigned int cmd, + void __user *arg) +{ + int karg; + + switch (cmd) { + case SIOCPNADDRESOURCE: + case SIOCPNDELRESOURCE: + if (get_user(karg, (int __user *)arg)) + return -EFAULT; + + return sk->sk_prot->ioctl(sk, cmd, &karg); + } + /* A positive return value means that the ioctl was not processed */ + return 1; +} #endif diff --git a/include/net/sock.h b/include/net/sock.h index c53d031a9e8a..d067cc9fa585 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1228,7 +1228,7 @@ struct proto { bool kern); int (*ioctl)(struct sock *sk, int cmd, - unsigned long arg); + int *karg); int (*init)(struct sock *sk); void (*destroy)(struct sock *sk); void (*shutdown)(struct sock *sk, int how); @@ -2972,6 +2972,9 @@ int sock_get_timeout(long timeo, void *optval, bool old_timeval); int sock_copy_user_timeval(struct __kernel_sock_timeval *tv, sockptr_t optval, int optlen, bool old_timeval); +int sock_ioctl_inout(struct sock *sk, unsigned int cmd, + void __user *arg, void *karg, size_t size); +int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg); static inline bool sk_is_readable(struct sock *sk) { if (sk->sk_prot->sock_is_readable) diff --git a/include/net/tcp.h b/include/net/tcp.h index b58cc605a1a3..6a2f039ee206 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -342,7 +342,7 @@ void tcp_release_cb(struct sock *sk); void tcp_wfree(struct sk_buff *skb); void 
tcp_write_timer_handler(struct sock *sk); void tcp_delack_timer_handler(struct sock *sk); -int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg); +int tcp_ioctl(struct sock *sk, int cmd, int *karg); int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb); void tcp_rcv_established(struct sock *sk, struct sk_buff *skb); void tcp_rcv_space_adjust(struct sock *sk); diff --git a/include/net/udp.h b/include/net/udp.h index ed5db1f3703c..b718bc5bb00b 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -284,7 +284,7 @@ void udp_flush_pending_frames(struct sock *sk); int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size); void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst); int udp_rcv(struct sk_buff *skb); -int udp_ioctl(struct sock *sk, int cmd, unsigned long arg); +int udp_ioctl(struct sock *sk, int cmd, int *karg); int udp_init_sock(struct sock *sk); int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); int __udp_disconnect(struct sock *sk, int flags); diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h index 936fd41bf147..69454f1f98b0 100644 --- a/include/trace/events/io_uring.h +++ b/include/trace/events/io_uring.h @@ -360,19 +360,18 @@ TRACE_EVENT(io_uring_complete, ); /** - * io_uring_submit_sqe - called before submitting one SQE + * io_uring_submit_req - called before submitting a request * * @req: pointer to a submitted request - * @force_nonblock: whether a context blocking or not * * Allows to track SQE submitting, to understand what was the source of it, SQ * thread or io_uring_enter call. */ -TRACE_EVENT(io_uring_submit_sqe, +TRACE_EVENT(io_uring_submit_req, - TP_PROTO(struct io_kiocb *req, bool force_nonblock), + TP_PROTO(struct io_kiocb *req), - TP_ARGS(req, force_nonblock), + TP_ARGS(req), TP_STRUCT__entry ( __field( void *, ctx ) @@ -380,7 +379,6 @@ TRACE_EVENT(io_uring_submit_sqe, __field( unsigned long long, user_data ) __field( u8, opcode ) __field( u32, flags ) - __field( bool, force_nonblock ) __field( bool, sq_thread ) __string( op_str, io_uring_get_opcode(req->opcode) ) @@ -392,16 +390,15 @@ TRACE_EVENT(io_uring_submit_sqe, __entry->user_data = req->cqe.user_data; __entry->opcode = req->opcode; __entry->flags = req->flags; - __entry->force_nonblock = force_nonblock; __entry->sq_thread = req->ctx->flags & IORING_SETUP_SQPOLL; __assign_str(op_str, io_uring_get_opcode(req->opcode)); ), TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, flags 0x%x, " - "non block %d, sq_thread %d", __entry->ctx, __entry->req, + "sq_thread %d", __entry->ctx, __entry->req, __entry->user_data, __get_str(op_str), - __entry->flags, __entry->force_nonblock, __entry->sq_thread) + __entry->flags, __entry->sq_thread) ); /* diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 2371ace04abe..8e61f8b7c2ce 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -173,6 +173,23 @@ enum { */ #define IORING_SETUP_DEFER_TASKRUN (1U << 13) +/* + * Application provides the memory for the rings + */ +#define IORING_SETUP_NO_MMAP (1U << 14) + +/* + * Register the ring fd in itself for use with + * IORING_REGISTER_USE_REGISTERED_RING; return a registered fd index rather + * than an fd. + */ +#define IORING_SETUP_REGISTERED_FD_ONLY (1U << 15) + +/* + * Removes indirection through the SQ index array. 
+ */ +#define IORING_SETUP_NO_SQARRAY (1U << 16) + enum io_uring_op { IORING_OP_NOP, IORING_OP_READV, @@ -252,6 +269,7 @@ enum io_uring_op { #define IORING_TIMEOUT_REALTIME (1U << 3) #define IORING_LINK_TIMEOUT_UPDATE (1U << 4) #define IORING_TIMEOUT_ETIME_SUCCESS (1U << 5) +#define IORING_TIMEOUT_MULTISHOT (1U << 6) #define IORING_TIMEOUT_CLOCK_MASK (IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME) #define IORING_TIMEOUT_UPDATE_MASK (IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE) /* @@ -286,11 +304,15 @@ enum io_uring_op { * request 'user_data' * IORING_ASYNC_CANCEL_ANY Match any request * IORING_ASYNC_CANCEL_FD_FIXED 'fd' passed in is a fixed descriptor + * IORING_ASYNC_CANCEL_USERDATA Match on user_data, default for no other key + * IORING_ASYNC_CANCEL_OP Match request based on opcode */ #define IORING_ASYNC_CANCEL_ALL (1U << 0) #define IORING_ASYNC_CANCEL_FD (1U << 1) #define IORING_ASYNC_CANCEL_ANY (1U << 2) #define IORING_ASYNC_CANCEL_FD_FIXED (1U << 3) +#define IORING_ASYNC_CANCEL_USERDATA (1U << 4) +#define IORING_ASYNC_CANCEL_OP (1U << 5) /* * send/sendmsg and recv/recvmsg flags (sqe->ioprio) @@ -349,6 +371,8 @@ enum { * applicable for IORING_MSG_DATA, obviously. */ #define IORING_MSG_RING_CQE_SKIP (1U << 0) +/* Pass through the flags from sqe->file_index to cqe->flags */ +#define IORING_MSG_RING_FLAGS_PASS (1U << 1) /* * IO completion data structure (Completion Queue Entry) @@ -389,6 +413,9 @@ enum { #define IORING_OFF_SQ_RING 0ULL #define IORING_OFF_CQ_RING 0x8000000ULL #define IORING_OFF_SQES 0x10000000ULL +#define IORING_OFF_PBUF_RING 0x80000000ULL +#define IORING_OFF_PBUF_SHIFT 16 +#define IORING_OFF_MMAP_MASK 0xf8000000ULL /* * Filled with the offset for mmap(2) @@ -402,7 +429,7 @@ struct io_sqring_offsets { __u32 dropped; __u32 array; __u32 resv1; - __u64 resv2; + __u64 user_addr; }; /* @@ -421,7 +448,7 @@ struct io_cqring_offsets { __u32 cqes; __u32 flags; __u32 resv1; - __u64 resv2; + __u64 user_addr; }; /* @@ -472,6 +499,7 @@ struct io_uring_params { #define IORING_FEAT_RSRC_TAGS (1U << 10) #define IORING_FEAT_CQE_SKIP (1U << 11) #define IORING_FEAT_LINKED_FILE (1U << 12) +#define IORING_FEAT_REG_REG_RING (1U << 13) /* * io_uring_register(2) opcodes and arguments @@ -519,7 +547,10 @@ enum { IORING_REGISTER_FILE_ALLOC_RANGE = 25, /* this goes last */ - IORING_REGISTER_LAST + IORING_REGISTER_LAST, + + /* flag added to the opcode to use a registered ring fd */ + IORING_REGISTER_USE_REGISTERED_RING = 1U << 31 }; /* io-wq worker categories */ @@ -564,19 +595,6 @@ struct io_uring_rsrc_update2 { __u32 resv2; }; -struct io_uring_notification_slot { - __u64 tag; - __u64 resv[3]; -}; - -struct io_uring_notification_register { - __u32 nr_slots; - __u32 resv; - __u64 resv2; - __u64 data; - __u64 resv3; -}; - /* Skip updating fd indexes set to this value in the fd table */ #define IORING_REGISTER_FILES_SKIP (-2) @@ -631,12 +649,26 @@ struct io_uring_buf_ring { }; }; +/* + * Flags for IORING_REGISTER_PBUF_RING. + * + * IOU_PBUF_RING_MMAP: If set, kernel will allocate the memory for the ring. + * The application must not set a ring_addr in struct + * io_uring_buf_reg, instead it must subsequently call + * mmap(2) with the offset set as: + * IORING_OFF_PBUF_RING | (bgid << IORING_OFF_PBUF_SHIFT) + * to get a virtual mapping for the ring. 
+ */ +enum { + IOU_PBUF_RING_MMAP = 1, +}; + /* argument for IORING_(UN)REGISTER_PBUF_RING */ struct io_uring_buf_reg { __u64 ring_addr; __u32 ring_entries; __u16 bgid; - __u16 pad; + __u16 flags; __u64 resv[3]; }; @@ -674,7 +706,9 @@ struct io_uring_sync_cancel_reg { __s32 fd; __u32 flags; struct __kernel_timespec timeout; - __u64 pad[4]; + __u8 opcode; + __u8 pad[7]; + __u64 pad2[3]; }; /* @@ -694,6 +728,14 @@ struct io_uring_recvmsg_out { __u32 flags; }; +/* + * Argument for IORING_OP_URING_CMD when file is a socket + */ +enum { + SOCKET_URING_OP_SIOCINQ = 0, + SOCKET_URING_OP_SIOCOUTQ, +}; + #ifdef __cplusplus } #endif diff --git a/io_uring/advise.c b/io_uring/advise.c index 449c6f14649f..7085804c513c 100644 --- a/io_uring/advise.c +++ b/io_uring/advise.c @@ -39,6 +39,7 @@ int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) ma->addr = READ_ONCE(sqe->addr); ma->len = READ_ONCE(sqe->len); ma->advice = READ_ONCE(sqe->fadvise_advice); + req->flags |= REQ_F_FORCE_ASYNC; return 0; #else return -EOPNOTSUPP; @@ -51,8 +52,7 @@ int io_madvise(struct io_kiocb *req, unsigned int issue_flags) struct io_madvise *ma = io_kiocb_to_cmd(req, struct io_madvise); int ret; - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; + WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice); io_req_set_res(req, ret, 0); @@ -62,6 +62,18 @@ int io_madvise(struct io_kiocb *req, unsigned int issue_flags) #endif } +static bool io_fadvise_force_async(struct io_fadvise *fa) +{ + switch (fa->advice) { + case POSIX_FADV_NORMAL: + case POSIX_FADV_RANDOM: + case POSIX_FADV_SEQUENTIAL: + return false; + default: + return true; + } +} + int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_fadvise *fa = io_kiocb_to_cmd(req, struct io_fadvise); @@ -72,6 +84,8 @@ int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) fa->offset = READ_ONCE(sqe->off); fa->len = READ_ONCE(sqe->len); fa->advice = READ_ONCE(sqe->fadvise_advice); + if (io_fadvise_force_async(fa)) + req->flags |= REQ_F_FORCE_ASYNC; return 0; } @@ -80,16 +94,7 @@ int io_fadvise(struct io_kiocb *req, unsigned int issue_flags) struct io_fadvise *fa = io_kiocb_to_cmd(req, struct io_fadvise); int ret; - if (issue_flags & IO_URING_F_NONBLOCK) { - switch (fa->advice) { - case POSIX_FADV_NORMAL: - case POSIX_FADV_RANDOM: - case POSIX_FADV_SEQUENTIAL: - break; - default: - return -EAGAIN; - } - } + WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK && io_fadvise_force_async(fa)); ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice); if (ret < 0) diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h index c2cde88aeed5..241245cb54a6 100644 --- a/io_uring/alloc_cache.h +++ b/io_uring/alloc_cache.h @@ -7,47 +7,60 @@ #define IO_ALLOC_CACHE_MAX 512 struct io_cache_entry { - struct hlist_node node; + struct io_wq_work_node node; }; static inline bool io_alloc_cache_put(struct io_alloc_cache *cache, struct io_cache_entry *entry) { - if (cache->nr_cached < IO_ALLOC_CACHE_MAX) { + if (cache->nr_cached < cache->max_cached) { cache->nr_cached++; - hlist_add_head(&entry->node, &cache->list); + wq_stack_add_head(&entry->node, &cache->list); + /* KASAN poisons object */ + kasan_slab_free_mempool(entry); return true; } return false; } +static inline bool io_alloc_cache_empty(struct io_alloc_cache *cache) +{ + return !cache->list.next; +} + static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *cache) { - if 
(!hlist_empty(&cache->list)) { - struct hlist_node *node = cache->list.first; + if (cache->list.next) { + struct io_cache_entry *entry; - hlist_del(node); + entry = container_of(cache->list.next, struct io_cache_entry, node); + kasan_unpoison_range(entry, cache->elem_size); + cache->list.next = cache->list.next->next; cache->nr_cached--; - return container_of(node, struct io_cache_entry, node); + return entry; } return NULL; } -static inline void io_alloc_cache_init(struct io_alloc_cache *cache) +static inline void io_alloc_cache_init(struct io_alloc_cache *cache, + unsigned max_nr, size_t size) { - INIT_HLIST_HEAD(&cache->list); + cache->list.next = NULL; cache->nr_cached = 0; + cache->max_cached = max_nr; + cache->elem_size = size; } static inline void io_alloc_cache_free(struct io_alloc_cache *cache, void (*free)(struct io_cache_entry *)) { - while (!hlist_empty(&cache->list)) { - struct hlist_node *node = cache->list.first; + while (1) { + struct io_cache_entry *entry = io_alloc_cache_get(cache); - hlist_del(node); - free(container_of(node, struct io_cache_entry, node)); + if (!entry) + break; + free(entry); } cache->nr_cached = 0; } diff --git a/io_uring/cancel.c b/io_uring/cancel.c index b4f5dfacc0c3..7b23607cf4af 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -22,33 +22,54 @@ struct io_cancel { u64 addr; u32 flags; s32 fd; + u8 opcode; }; #define CANCEL_FLAGS (IORING_ASYNC_CANCEL_ALL | IORING_ASYNC_CANCEL_FD | \ - IORING_ASYNC_CANCEL_ANY | IORING_ASYNC_CANCEL_FD_FIXED) + IORING_ASYNC_CANCEL_ANY | IORING_ASYNC_CANCEL_FD_FIXED | \ + IORING_ASYNC_CANCEL_USERDATA | IORING_ASYNC_CANCEL_OP) + +/* + * Returns true if the request matches the criteria outlined by 'cd'. + */ +bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd) +{ + bool match_user_data = cd->flags & IORING_ASYNC_CANCEL_USERDATA; + + if (req->ctx != cd->ctx) + return false; + + if (!(cd->flags & (IORING_ASYNC_CANCEL_FD | IORING_ASYNC_CANCEL_OP))) + match_user_data = true; + + if (cd->flags & IORING_ASYNC_CANCEL_ANY) + goto check_seq; + if (cd->flags & IORING_ASYNC_CANCEL_FD) { + if (req->file != cd->file) + return false; + } + if (cd->flags & IORING_ASYNC_CANCEL_OP) { + if (req->opcode != cd->opcode) + return false; + } + if (match_user_data && req->cqe.user_data != cd->data) + return false; + if (cd->flags & IORING_ASYNC_CANCEL_ALL) { +check_seq: + if (cd->seq == req->work.cancel_seq) + return false; + req->work.cancel_seq = cd->seq; + } + + return true; +} static bool io_cancel_cb(struct io_wq_work *work, void *data) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); struct io_cancel_data *cd = data; - if (req->ctx != cd->ctx) - return false; - if (cd->flags & IORING_ASYNC_CANCEL_ANY) { - ; - } else if (cd->flags & IORING_ASYNC_CANCEL_FD) { - if (req->file != cd->file) - return false; - } else { - if (req->cqe.user_data != cd->data) - return false; - } - if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) { - if (cd->seq == req->work.cancel_seq) - return false; - req->work.cancel_seq = cd->seq; - } - return true; + return io_cancel_req_match(req, cd); } static int io_async_cancel_one(struct io_uring_task *tctx, @@ -111,7 +132,7 @@ int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(req->flags & REQ_F_BUFFER_SELECT)) return -EINVAL; - if (sqe->off || sqe->len || sqe->splice_fd_in) + if (sqe->off || sqe->splice_fd_in) return -EINVAL; cancel->addr = READ_ONCE(sqe->addr); @@ -123,6 +144,11 @@ int io_async_cancel_prep(struct 
io_kiocb *req, const struct io_uring_sqe *sqe) return -EINVAL; cancel->fd = READ_ONCE(sqe->fd); } + if (cancel->flags & IORING_ASYNC_CANCEL_OP) { + if (cancel->flags & IORING_ASYNC_CANCEL_ANY) + return -EINVAL; + cancel->opcode = READ_ONCE(sqe->len); + } return 0; } @@ -169,6 +195,7 @@ int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) .ctx = req->ctx, .data = cancel->addr, .flags = cancel->flags, + .opcode = cancel->opcode, .seq = atomic_inc_return(&req->ctx->cancel_seq), }; struct io_uring_task *tctx = req->task->io_uring; @@ -216,13 +243,10 @@ static int __io_sync_cancel(struct io_uring_task *tctx, /* fixed must be grabbed every time since we drop the uring_lock */ if ((cd->flags & IORING_ASYNC_CANCEL_FD) && (cd->flags & IORING_ASYNC_CANCEL_FD_FIXED)) { - unsigned long file_ptr; - if (unlikely(fd >= ctx->nr_user_files)) return -EBADF; fd = array_index_nospec(fd, ctx->nr_user_files); - file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr; - cd->file = (struct file *) (file_ptr & FFS_MASK); + cd->file = io_file_from_index(&ctx->file_table, fd); if (!cd->file) return -EBADF; } @@ -241,17 +265,22 @@ int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg) struct io_uring_sync_cancel_reg sc; struct fd f = { }; DEFINE_WAIT(wait); - int ret; + int ret, i; if (copy_from_user(&sc, arg, sizeof(sc))) return -EFAULT; if (sc.flags & ~CANCEL_FLAGS) return -EINVAL; - if (sc.pad[0] || sc.pad[1] || sc.pad[2] || sc.pad[3]) - return -EINVAL; + for (i = 0; i < ARRAY_SIZE(sc.pad); i++) + if (sc.pad[i]) + return -EINVAL; + for (i = 0; i < ARRAY_SIZE(sc.pad2); i++) + if (sc.pad2[i]) + return -EINVAL; cd.data = sc.addr; cd.flags = sc.flags; + cd.opcode = sc.opcode; /* we can grab a normal file descriptor upfront */ if ((cd.flags & IORING_ASYNC_CANCEL_FD) && diff --git a/io_uring/cancel.h b/io_uring/cancel.h index 6a59ee484d0c..fc98622e6166 100644 --- a/io_uring/cancel.h +++ b/io_uring/cancel.h @@ -8,11 +8,11 @@ struct io_cancel_data { u64 data; struct file *file; }; + u8 opcode; u32 flags; int seq; }; - int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags); @@ -21,3 +21,4 @@ int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd, void init_hash_table(struct io_hash_table *table, unsigned size); int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg); +bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd); diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index 882bd56b01ed..aa6913ce09d6 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -48,10 +48,13 @@ static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id, return 0; } -static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, - struct seq_file *m) +/* + * Caller holds a reference to the file already, we don't need to do + * anything else to get an extra reference. 
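Returning to the cancel.c changes above: the new IORING_ASYNC_CANCEL_OP key lets a single cancel request target every pending request of a given opcode. The sketch below is an editorial example, not part of the patch; it fills the SQE by hand using the uapi field names and relies on io_async_cancel_prep() reading the opcode to match from sqe->len, as shown above.

/*
 * Editorial sketch: cancel all pending IORING_OP_RECV requests on this
 * ring, keyed purely on opcode (no user_data or fd match).
 */
#include <linux/io_uring.h>
#include <string.h>

static void prep_cancel_all_recv(struct io_uring_sqe *sqe)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_ASYNC_CANCEL;
	sqe->cancel_flags = IORING_ASYNC_CANCEL_OP | IORING_ASYNC_CANCEL_ALL;
	sqe->len = IORING_OP_RECV;	/* opcode to match */
	sqe->user_data = 0xcafe;	/* tags the cancel's own completion */
}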
+ */ +__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f) { - struct io_sq_data *sq = NULL; + struct io_ring_ctx *ctx = f->private_data; struct io_overflow_cqe *ocqe; struct io_rings *r = ctx->rings; unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1; @@ -62,6 +65,7 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, unsigned int cq_shift = 0; unsigned int sq_shift = 0; unsigned int sq_entries, cq_entries; + int sq_pid = -1, sq_cpu = -1; bool has_lock; unsigned int i; @@ -91,6 +95,8 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct io_uring_sqe *sqe; unsigned int sq_idx; + if (ctx->flags & IORING_SETUP_NO_SQARRAY) + break; sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]); if (sq_idx > sq_mask) continue; @@ -139,13 +145,19 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, has_lock = mutex_trylock(&ctx->uring_lock); if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) { - sq = ctx->sq_data; - if (!sq->thread) - sq = NULL; + struct io_sq_data *sq = ctx->sq_data; + + if (mutex_trylock(&sq->lock)) { + if (sq->thread) { + sq_pid = task_pid_nr(sq->thread); + sq_cpu = task_cpu(sq->thread); + } + mutex_unlock(&sq->lock); + } } - seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1); - seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1); + seq_printf(m, "SqThread:\t%d\n", sq_pid); + seq_printf(m, "SqThreadCpu:\t%d\n", sq_cpu); seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files); for (i = 0; has_lock && i < ctx->nr_user_files; i++) { struct file *f = io_file_from_index(&ctx->file_table, i); @@ -205,14 +217,4 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, spin_unlock(&ctx->completion_lock); } - -__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f) -{ - struct io_ring_ctx *ctx = f->private_data; - - if (percpu_ref_tryget(&ctx->refs)) { - __io_uring_show_fdinfo(ctx, m); - percpu_ref_put(&ctx->refs); - } -} #endif diff --git a/io_uring/filetable.c b/io_uring/filetable.c index b80614e7d605..e7d749991de4 100644 --- a/io_uring/filetable.c +++ b/io_uring/filetable.c @@ -64,7 +64,6 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file, u32 slot_index) __must_hold(&req->ctx->uring_lock) { - bool needs_switch = false; struct io_fixed_file *file_slot; int ret; @@ -79,20 +78,13 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file, file_slot = io_fixed_file_slot(&ctx->file_table, slot_index); if (file_slot->file_ptr) { - struct file *old_file; - - ret = io_rsrc_node_switch_start(ctx); - if (ret) - goto err; - - old_file = (struct file *)(file_slot->file_ptr & FFS_MASK); ret = io_queue_rsrc_removal(ctx->file_data, slot_index, - ctx->rsrc_node, old_file); + io_slot_file(file_slot)); if (ret) - goto err; + return ret; + file_slot->file_ptr = 0; io_file_bitmap_clear(&ctx->file_table, slot_index); - needs_switch = true; } ret = io_scm_file_account(ctx, file); @@ -101,9 +93,6 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file, io_fixed_file_set(file_slot, file); io_file_bitmap_set(&ctx->file_table, slot_index); } -err: - if (needs_switch) - io_rsrc_node_switch(ctx, ctx->file_data); return ret; } @@ -149,30 +138,25 @@ int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset) { struct io_fixed_file *file_slot; - struct file *file; int ret; if (unlikely(!ctx->file_data)) return -ENXIO; if 
(offset >= ctx->nr_user_files) return -EINVAL; - ret = io_rsrc_node_switch_start(ctx); - if (ret) - return ret; offset = array_index_nospec(offset, ctx->nr_user_files); file_slot = io_fixed_file_slot(&ctx->file_table, offset); if (!file_slot->file_ptr) return -EBADF; - file = (struct file *)(file_slot->file_ptr & FFS_MASK); - ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file); + ret = io_queue_rsrc_removal(ctx->file_data, offset, + io_slot_file(file_slot)); if (ret) return ret; file_slot->file_ptr = 0; io_file_bitmap_clear(&ctx->file_table, offset); - io_rsrc_node_switch(ctx, ctx->file_data); return 0; } diff --git a/io_uring/filetable.h b/io_uring/filetable.h index 351111ff8882..b47adf170c31 100644 --- a/io_uring/filetable.h +++ b/io_uring/filetable.h @@ -5,10 +5,6 @@ #include #include -#define FFS_NOWAIT 0x1UL -#define FFS_ISREG 0x2UL -#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG) - bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files); void io_free_file_tables(struct io_file_table *table); @@ -43,21 +39,31 @@ io_fixed_file_slot(struct io_file_table *table, unsigned i) return &table->files[i]; } +#define FFS_NOWAIT 0x1UL +#define FFS_ISREG 0x2UL +#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG) + +static inline unsigned int io_slot_flags(struct io_fixed_file *slot) +{ + return (slot->file_ptr & ~FFS_MASK) << REQ_F_SUPPORT_NOWAIT_BIT; +} + +static inline struct file *io_slot_file(struct io_fixed_file *slot) +{ + return (struct file *)(slot->file_ptr & FFS_MASK); +} + static inline struct file *io_file_from_index(struct io_file_table *table, int index) { - struct io_fixed_file *slot = io_fixed_file_slot(table, index); - - return (struct file *) (slot->file_ptr & FFS_MASK); + return io_slot_file(io_fixed_file_slot(table, index)); } static inline void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file) { - unsigned long file_ptr = (unsigned long) file; - - file_ptr |= io_file_get_flags(file); - file_slot->file_ptr = file_ptr; + file_slot->file_ptr = (unsigned long)file | + (io_file_get_flags(file) >> REQ_F_SUPPORT_NOWAIT_BIT); } static inline void io_reset_alloc_hint(struct io_ring_ctx *ctx) diff --git a/io_uring/fs.c b/io_uring/fs.c index 7100c293c13a..08e3b175469c 100644 --- a/io_uring/fs.c +++ b/io_uring/fs.c @@ -74,6 +74,7 @@ int io_renameat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) } req->flags |= REQ_F_NEED_CLEANUP; + req->flags |= REQ_F_FORCE_ASYNC; return 0; } @@ -82,8 +83,7 @@ int io_renameat(struct io_kiocb *req, unsigned int issue_flags) struct io_rename *ren = io_kiocb_to_cmd(req, struct io_rename); int ret; - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; + WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd, ren->newpath, ren->flags); @@ -123,6 +123,7 @@ int io_unlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return PTR_ERR(un->filename); req->flags |= REQ_F_NEED_CLEANUP; + req->flags |= REQ_F_FORCE_ASYNC; return 0; } @@ -131,8 +132,7 @@ int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags) struct io_unlink *un = io_kiocb_to_cmd(req, struct io_unlink); int ret; - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; + WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); if (un->flags & AT_REMOVEDIR) ret = do_rmdir(un->dfd, un->filename); @@ -170,6 +170,7 @@ int io_mkdirat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return PTR_ERR(mkd->filename); req->flags |= REQ_F_NEED_CLEANUP; + req->flags |= 
REQ_F_FORCE_ASYNC; return 0; } @@ -178,8 +179,7 @@ int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags) struct io_mkdir *mkd = io_kiocb_to_cmd(req, struct io_mkdir); int ret; - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; + WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode); @@ -220,6 +220,7 @@ int io_symlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) } req->flags |= REQ_F_NEED_CLEANUP; + req->flags |= REQ_F_FORCE_ASYNC; return 0; } @@ -228,8 +229,7 @@ int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags) struct io_link *sl = io_kiocb_to_cmd(req, struct io_link); int ret; - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; + WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = do_symlinkat(sl->oldpath, sl->new_dfd, sl->newpath); @@ -243,7 +243,7 @@ int io_linkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) struct io_link *lnk = io_kiocb_to_cmd(req, struct io_link); const char __user *oldf, *newf; - if (sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) + if (sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -265,6 +265,7 @@ int io_linkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) } req->flags |= REQ_F_NEED_CLEANUP; + req->flags |= REQ_F_FORCE_ASYNC; return 0; } @@ -273,8 +274,7 @@ int io_linkat(struct io_kiocb *req, unsigned int issue_flags) struct io_link *lnk = io_kiocb_to_cmd(req, struct io_link); int ret; - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; + WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = do_linkat(lnk->old_dfd, lnk->oldpath, lnk->new_dfd, lnk->newpath, lnk->flags); diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index 092a14a174ff..d66ca0438740 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include "io-wq.h" @@ -39,7 +40,7 @@ enum { }; /* - * One for each thread in a wqe pool + * One for each thread in a wq pool */ struct io_worker { refcount_t ref; @@ -47,7 +48,7 @@ struct io_worker { struct hlist_nulls_node nulls_node; struct list_head all_list; struct task_struct *task; - struct io_wqe *wqe; + struct io_wq *wq; struct io_wq_work *cur_work; struct io_wq_work *next_work; @@ -73,7 +74,7 @@ struct io_worker { #define IO_WQ_NR_HASH_BUCKETS (1u << IO_WQ_HASH_ORDER) -struct io_wqe_acct { +struct io_wq_acct { unsigned nr_workers; unsigned max_workers; int index; @@ -89,26 +90,6 @@ enum { IO_WQ_ACCT_NR, }; -/* - * Per-node worker thread pool - */ -struct io_wqe { - raw_spinlock_t lock; - struct io_wqe_acct acct[IO_WQ_ACCT_NR]; - - int node; - - struct hlist_nulls_head free_list; - struct list_head all_list; - - struct wait_queue_entry wait; - - struct io_wq *wq; - struct io_wq_work *hash_tail[IO_WQ_NR_HASH_BUCKETS]; - - cpumask_var_t cpu_mask; -}; - /* * Per io_wq state */ @@ -127,7 +108,19 @@ struct io_wq { struct task_struct *task; - struct io_wqe *wqes[]; + struct io_wq_acct acct[IO_WQ_ACCT_NR]; + + /* lock protects access to elements below */ + raw_spinlock_t lock; + + struct hlist_nulls_head free_list; + struct list_head all_list; + + struct wait_queue_entry wait; + + struct io_wq_work *hash_tail[IO_WQ_NR_HASH_BUCKETS]; + + cpumask_var_t cpu_mask; }; static enum cpuhp_state io_wq_online; @@ -140,10 +133,10 @@ struct io_cb_cancel_data { bool cancel_all; }; -static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index); -static void io_wqe_dec_running(struct 
io_worker *worker); -static bool io_acct_cancel_pending_work(struct io_wqe *wqe, - struct io_wqe_acct *acct, +static bool create_io_worker(struct io_wq *wq, int index); +static void io_wq_dec_running(struct io_worker *worker); +static bool io_acct_cancel_pending_work(struct io_wq *wq, + struct io_wq_acct *acct, struct io_cb_cancel_data *match); static void create_worker_cb(struct callback_head *cb); static void io_wq_cancel_tw_create(struct io_wq *wq); @@ -159,20 +152,20 @@ static void io_worker_release(struct io_worker *worker) complete(&worker->ref_done); } -static inline struct io_wqe_acct *io_get_acct(struct io_wqe *wqe, bool bound) +static inline struct io_wq_acct *io_get_acct(struct io_wq *wq, bool bound) { - return &wqe->acct[bound ? IO_WQ_ACCT_BOUND : IO_WQ_ACCT_UNBOUND]; + return &wq->acct[bound ? IO_WQ_ACCT_BOUND : IO_WQ_ACCT_UNBOUND]; } -static inline struct io_wqe_acct *io_work_get_acct(struct io_wqe *wqe, - struct io_wq_work *work) +static inline struct io_wq_acct *io_work_get_acct(struct io_wq *wq, + struct io_wq_work *work) { - return io_get_acct(wqe, !(work->flags & IO_WQ_WORK_UNBOUND)); + return io_get_acct(wq, !(work->flags & IO_WQ_WORK_UNBOUND)); } -static inline struct io_wqe_acct *io_wqe_get_acct(struct io_worker *worker) +static inline struct io_wq_acct *io_wq_get_acct(struct io_worker *worker) { - return io_get_acct(worker->wqe, worker->flags & IO_WORKER_F_BOUND); + return io_get_acct(worker->wq, worker->flags & IO_WORKER_F_BOUND); } static void io_worker_ref_put(struct io_wq *wq) @@ -181,16 +174,25 @@ static void io_worker_ref_put(struct io_wq *wq) complete(&wq->worker_done); } +bool io_wq_worker_stopped(void) +{ + struct io_worker *worker = current->worker_private; + + if (WARN_ON_ONCE(!io_wq_current_is_worker())) + return true; + + return test_bit(IO_WQ_BIT_EXIT, &worker->wq->state); +} + static void io_worker_cancel_cb(struct io_worker *worker) { - struct io_wqe_acct *acct = io_wqe_get_acct(worker); - struct io_wqe *wqe = worker->wqe; - struct io_wq *wq = wqe->wq; + struct io_wq_acct *acct = io_wq_get_acct(worker); + struct io_wq *wq = worker->wq; atomic_dec(&acct->nr_running); - raw_spin_lock(&worker->wqe->lock); + raw_spin_lock(&wq->lock); acct->nr_workers--; - raw_spin_unlock(&worker->wqe->lock); + raw_spin_unlock(&wq->lock); io_worker_ref_put(wq); clear_bit_unlock(0, &worker->create_state); io_worker_release(worker); @@ -208,8 +210,7 @@ static bool io_task_worker_match(struct callback_head *cb, void *data) static void io_worker_exit(struct io_worker *worker) { - struct io_wqe *wqe = worker->wqe; - struct io_wq *wq = wqe->wq; + struct io_wq *wq = worker->wq; while (1) { struct callback_head *cb = task_work_cancel_match(wq->task, @@ -223,41 +224,51 @@ static void io_worker_exit(struct io_worker *worker) io_worker_release(worker); wait_for_completion(&worker->ref_done); - raw_spin_lock(&wqe->lock); + raw_spin_lock(&wq->lock); if (worker->flags & IO_WORKER_F_FREE) hlist_nulls_del_rcu(&worker->nulls_node); list_del_rcu(&worker->all_list); - raw_spin_unlock(&wqe->lock); - io_wqe_dec_running(worker); - worker->flags = 0; - preempt_disable(); - current->flags &= ~PF_IO_WORKER; - preempt_enable(); + raw_spin_unlock(&wq->lock); + io_wq_dec_running(worker); + /* + * this worker is a goner, clear ->worker_private to avoid any + * inc/dec running calls that could happen as part of exit from + * touching 'worker'. 
+ */ + current->worker_private = NULL; kfree_rcu(worker, rcu); - io_worker_ref_put(wqe->wq); + io_worker_ref_put(wq); do_exit(0); } -static inline bool io_acct_run_queue(struct io_wqe_acct *acct) +static inline bool __io_acct_run_queue(struct io_wq_acct *acct) { - bool ret = false; + return !test_bit(IO_ACCT_STALLED_BIT, &acct->flags) && + !wq_list_empty(&acct->work_list); +} +/* + * If there's work to do, returns true with acct->lock acquired. If not, + * returns false with no lock held. + */ +static inline bool io_acct_run_queue(struct io_wq_acct *acct) + __acquires(&acct->lock) +{ raw_spin_lock(&acct->lock); - if (!wq_list_empty(&acct->work_list) && - !test_bit(IO_ACCT_STALLED_BIT, &acct->flags)) - ret = true; - raw_spin_unlock(&acct->lock); + if (__io_acct_run_queue(acct)) + return true; - return ret; + raw_spin_unlock(&acct->lock); + return false; } /* * Check head of free list for an available worker. If one isn't available, * caller must create one. */ -static bool io_wqe_activate_free_worker(struct io_wqe *wqe, - struct io_wqe_acct *acct) +static bool io_wq_activate_free_worker(struct io_wq *wq, + struct io_wq_acct *acct) __must_hold(RCU) { struct hlist_nulls_node *n; @@ -268,18 +279,21 @@ static bool io_wqe_activate_free_worker(struct io_wqe *wqe, * activate. If a given worker is on the free_list but in the process * of exiting, keep trying. */ - hlist_nulls_for_each_entry_rcu(worker, n, &wqe->free_list, nulls_node) { + hlist_nulls_for_each_entry_rcu(worker, n, &wq->free_list, nulls_node) { if (!io_worker_get(worker)) continue; - if (io_wqe_get_acct(worker) != acct) { + if (io_wq_get_acct(worker) != acct) { io_worker_release(worker); continue; } - if (wake_up_process(worker->task)) { - io_worker_release(worker); - return true; - } + /* + * If the worker is already running, it's either already + * starting work or finishing work. In either case, if it does + * go to sleep, we'll kick off a new task for this work anyway. + */ + wake_up_process(worker->task); io_worker_release(worker); + return true; } return false; } @@ -289,7 +303,7 @@ static bool io_wqe_activate_free_worker(struct io_wqe *wqe, * We need a worker. If we find a free one, we're good. If not, and we're * below the max number of workers, create one.
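The max_workers ceiling consulted by this worker-creation path is not fixed: besides the defaults that io_wq_create() installs later in this patch (RLIMIT_NPROC for the unbounded pool), userspace can clamp both pools per ring with the pre-existing IORING_REGISTER_IOWQ_MAX_WORKERS opcode. A hedged, editorial sketch (not part of the patch):

/*
 * Editorial sketch: cap this ring's io-wq pools from userspace.
 * values[0] = bounded workers, values[1] = unbounded workers; a zero
 * leaves that limit unchanged, and the previous limits are copied back.
 */
#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <unistd.h>

static int limit_iowq_workers(int ring_fd, unsigned int bounded,
			      unsigned int unbounded)
{
	unsigned int values[2] = { bounded, unbounded };

	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_IOWQ_MAX_WORKERS, values, 2);
}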
*/ -static bool io_wqe_create_worker(struct io_wqe *wqe, struct io_wqe_acct *acct) +static bool io_wq_create_worker(struct io_wq *wq, struct io_wq_acct *acct) { /* * Most likely an attempt to queue unbounded work on an io_wq that @@ -298,21 +312,21 @@ static bool io_wqe_create_worker(struct io_wqe *wqe, struct io_wqe_acct *acct) if (unlikely(!acct->max_workers)) pr_warn_once("io-wq is not configured for unbound workers"); - raw_spin_lock(&wqe->lock); + raw_spin_lock(&wq->lock); if (acct->nr_workers >= acct->max_workers) { - raw_spin_unlock(&wqe->lock); + raw_spin_unlock(&wq->lock); return true; } acct->nr_workers++; - raw_spin_unlock(&wqe->lock); + raw_spin_unlock(&wq->lock); atomic_inc(&acct->nr_running); - atomic_inc(&wqe->wq->worker_refs); - return create_io_worker(wqe->wq, wqe, acct->index); + atomic_inc(&wq->worker_refs); + return create_io_worker(wq, acct->index); } -static void io_wqe_inc_running(struct io_worker *worker) +static void io_wq_inc_running(struct io_worker *worker) { - struct io_wqe_acct *acct = io_wqe_get_acct(worker); + struct io_wq_acct *acct = io_wq_get_acct(worker); atomic_inc(&acct->nr_running); } @@ -321,22 +335,22 @@ static void create_worker_cb(struct callback_head *cb) { struct io_worker *worker; struct io_wq *wq; - struct io_wqe *wqe; - struct io_wqe_acct *acct; + + struct io_wq_acct *acct; bool do_create = false; worker = container_of(cb, struct io_worker, create_work); - wqe = worker->wqe; - wq = wqe->wq; - acct = &wqe->acct[worker->create_index]; - raw_spin_lock(&wqe->lock); + wq = worker->wq; + acct = &wq->acct[worker->create_index]; + raw_spin_lock(&wq->lock); + if (acct->nr_workers < acct->max_workers) { acct->nr_workers++; do_create = true; } - raw_spin_unlock(&wqe->lock); + raw_spin_unlock(&wq->lock); if (do_create) { - create_io_worker(wq, wqe, worker->create_index); + create_io_worker(wq, worker->create_index); } else { atomic_dec(&acct->nr_running); io_worker_ref_put(wq); @@ -346,11 +360,10 @@ static void create_worker_cb(struct callback_head *cb) } static bool io_queue_worker_create(struct io_worker *worker, - struct io_wqe_acct *acct, + struct io_wq_acct *acct, task_work_func_t func) { - struct io_wqe *wqe = worker->wqe; - struct io_wq *wq = wqe->wq; + struct io_wq *wq = worker->wq; /* raced with exit, just ignore create call */ if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) @@ -392,10 +405,10 @@ fail: return false; } -static void io_wqe_dec_running(struct io_worker *worker) +static void io_wq_dec_running(struct io_worker *worker) { - struct io_wqe_acct *acct = io_wqe_get_acct(worker); - struct io_wqe *wqe = worker->wqe; + struct io_wq_acct *acct = io_wq_get_acct(worker); + struct io_wq *wq = worker->wq; if (!(worker->flags & IO_WORKER_F_UP)) return; @@ -405,8 +418,9 @@ static void io_wqe_dec_running(struct io_worker *worker) if (!io_acct_run_queue(acct)) return; + raw_spin_unlock(&acct->lock); atomic_inc(&acct->nr_running); - atomic_inc(&wqe->wq->worker_refs); + atomic_inc(&wq->worker_refs); io_queue_worker_create(worker, acct, create_worker_cb); } @@ -414,29 +428,25 @@ static void io_wqe_dec_running(struct io_worker *worker) * Worker will start processing some work. 
Move it to the busy list, if * it's currently on the freelist */ -static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker) +static void __io_worker_busy(struct io_wq *wq, struct io_worker *worker) { if (worker->flags & IO_WORKER_F_FREE) { worker->flags &= ~IO_WORKER_F_FREE; - raw_spin_lock(&wqe->lock); + raw_spin_lock(&wq->lock); hlist_nulls_del_init_rcu(&worker->nulls_node); - raw_spin_unlock(&wqe->lock); + raw_spin_unlock(&wq->lock); } } /* - * No work, worker going to sleep. Move to freelist, and unuse mm if we - * have one attached. Dropping the mm may potentially sleep, so we drop - * the lock in that case and return success. Since the caller has to - * retry the loop in that case (we changed task state), we don't regrab - * the lock if we return success. + * No work, worker going to sleep. Move to freelist. */ -static void __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker) - __must_hold(wqe->lock) +static void __io_worker_idle(struct io_wq *wq, struct io_worker *worker) + __must_hold(wq->lock) { if (!(worker->flags & IO_WORKER_F_FREE)) { worker->flags |= IO_WORKER_F_FREE; - hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); + hlist_nulls_add_head_rcu(&worker->nulls_node, &wq->free_list); } } @@ -445,17 +455,16 @@ static inline unsigned int io_get_work_hash(struct io_wq_work *work) return work->flags >> IO_WQ_HASH_SHIFT; } -static bool io_wait_on_hash(struct io_wqe *wqe, unsigned int hash) +static bool io_wait_on_hash(struct io_wq *wq, unsigned int hash) { - struct io_wq *wq = wqe->wq; bool ret = false; spin_lock_irq(&wq->hash->wait.lock); - if (list_empty(&wqe->wait.entry)) { - __add_wait_queue(&wq->hash->wait, &wqe->wait); + if (list_empty(&wq->wait.entry)) { + __add_wait_queue(&wq->hash->wait, &wq->wait); if (!test_bit(hash, &wq->hash->map)) { __set_current_state(TASK_RUNNING); - list_del_init(&wqe->wait.entry); + list_del_init(&wq->wait.entry); ret = true; } } @@ -463,14 +472,14 @@ static bool io_wait_on_hash(struct io_wqe *wqe, unsigned int hash) return ret; } -static struct io_wq_work *io_get_next_work(struct io_wqe_acct *acct, +static struct io_wq_work *io_get_next_work(struct io_wq_acct *acct, struct io_worker *worker) __must_hold(acct->lock) { struct io_wq_work_node *node, *prev; struct io_wq_work *work, *tail; unsigned int stall_hash = -1U; - struct io_wqe *wqe = worker->wqe; + struct io_wq *wq = worker->wq; wq_list_for_each(node, prev, &acct->work_list) { unsigned int hash; @@ -485,11 +494,11 @@ static struct io_wq_work *io_get_next_work(struct io_wqe_acct *acct, hash = io_get_work_hash(work); /* all items with this hash lie in [work, tail] */ - tail = wqe->hash_tail[hash]; + tail = wq->hash_tail[hash]; /* hashed, can run if not already running */ - if (!test_and_set_bit(hash, &wqe->wq->hash->map)) { - wqe->hash_tail[hash] = NULL; + if (!test_and_set_bit(hash, &wq->hash->map)) { + wq->hash_tail[hash] = NULL; wq_list_cut(&acct->work_list, &tail->list, prev); return work; } @@ -508,12 +517,12 @@ static struct io_wq_work *io_get_next_work(struct io_wqe_acct *acct, */ set_bit(IO_ACCT_STALLED_BIT, &acct->flags); raw_spin_unlock(&acct->lock); - unstalled = io_wait_on_hash(wqe, stall_hash); + unstalled = io_wait_on_hash(wq, stall_hash); raw_spin_lock(&acct->lock); if (unstalled) { clear_bit(IO_ACCT_STALLED_BIT, &acct->flags); - if (wq_has_sleeper(&wqe->wq->hash->wait)) - wake_up(&wqe->wq->hash->wait); + if (wq_has_sleeper(&wq->hash->wait)) + wake_up(&wq->hash->wait); } } @@ -534,13 +543,14 @@ static void io_assign_current_work(struct 
io_worker *worker, raw_spin_unlock(&worker->lock); } -static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work); - -static void io_worker_handle_work(struct io_worker *worker) +/* + * Called with acct->lock held, drops it before returning + */ +static void io_worker_handle_work(struct io_wq_acct *acct, + struct io_worker *worker) + __releases(&acct->lock) { - struct io_wqe_acct *acct = io_wqe_get_acct(worker); - struct io_wqe *wqe = worker->wqe; - struct io_wq *wq = wqe->wq; + struct io_wq *wq = worker->wq; bool do_kill = test_bit(IO_WQ_BIT_EXIT, &wq->state); do { @@ -553,11 +563,10 @@ static void io_worker_handle_work(struct io_worker *worker) * can't make progress, any work completion or insertion will * clear the stalled flag. */ - raw_spin_lock(&acct->lock); work = io_get_next_work(acct, worker); raw_spin_unlock(&acct->lock); if (work) { - __io_worker_busy(wqe, worker); + __io_worker_busy(wq, worker); /* * Make sure cancelation can find this, even before @@ -595,7 +604,7 @@ static void io_worker_handle_work(struct io_worker *worker) } io_assign_current_work(worker, work); if (linked) - io_wqe_enqueue(wqe, linked); + io_wq_enqueue(wq, linked); if (hash != -1U && !next_hashed) { /* serialize hash clear with wake_up() */ @@ -607,15 +616,18 @@ static void io_worker_handle_work(struct io_worker *worker) wake_up(&wq->hash->wait); } } while (work); + + if (!__io_acct_run_queue(acct)) + break; + raw_spin_lock(&acct->lock); } while (1); } -static int io_wqe_worker(void *data) +static int io_wq_worker(void *data) { struct io_worker *worker = data; - struct io_wqe_acct *acct = io_wqe_get_acct(worker); - struct io_wqe *wqe = worker->wqe; - struct io_wq *wq = wqe->wq; + struct io_wq_acct *acct = io_wq_get_acct(worker); + struct io_wq *wq = worker->wq; bool exit_mask = false, last_timeout = false; char buf[TASK_COMM_LEN]; @@ -628,23 +640,28 @@ static int io_wqe_worker(void *data) long ret; set_current_state(TASK_INTERRUPTIBLE); - while (io_acct_run_queue(acct)) - io_worker_handle_work(worker); - raw_spin_lock(&wqe->lock); + /* + * If we have work to do, io_acct_run_queue() returns with + * the acct->lock held. If not, it will drop it. + */ + while (io_acct_run_queue(acct)) + io_worker_handle_work(acct, worker); + + raw_spin_lock(&wq->lock); /* * Last sleep timed out. Exit if we're not the last worker, * or if someone modified our affinity. 
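The affinity referred to here is the io-wq cpu_mask, which this patch moves from the per-node io_wqe to the single io_wq. Userspace updates it through the pre-existing IORING_REGISTER_IOWQ_AFF opcode; a rough editorial sketch, not part of the patch (requires _GNU_SOURCE for the cpu_set_t macros):

/*
 * Editorial sketch: pin this ring's io-wq workers to CPUs 0 and 1.
 * The kernel copies the mask into the wq-wide cpu_mask that the
 * workers test against above.
 */
#define _GNU_SOURCE
#include <linux/io_uring.h>
#include <sched.h>
#include <sys/syscall.h>
#include <unistd.h>

static int pin_iowq_workers(int ring_fd)
{
	cpu_set_t mask;

	CPU_ZERO(&mask);
	CPU_SET(0, &mask);
	CPU_SET(1, &mask);
	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_IOWQ_AFF, &mask, sizeof(mask));
}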
*/ if (last_timeout && (exit_mask || acct->nr_workers > 1)) { acct->nr_workers--; - raw_spin_unlock(&wqe->lock); + raw_spin_unlock(&wq->lock); __set_current_state(TASK_RUNNING); break; } last_timeout = false; - __io_worker_idle(wqe, worker); - raw_spin_unlock(&wqe->lock); + __io_worker_idle(wq, worker); + raw_spin_unlock(&wq->lock); if (io_run_task_work()) continue; ret = schedule_timeout(WORKER_IDLE_TIMEOUT); @@ -658,12 +675,12 @@ static int io_wqe_worker(void *data) if (!ret) { last_timeout = true; exit_mask = !cpumask_test_cpu(raw_smp_processor_id(), - wqe->cpu_mask); + wq->cpu_mask); } } - if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) - io_worker_handle_work(worker); + if (test_bit(IO_WQ_BIT_EXIT, &wq->state) && io_acct_run_queue(acct)) + io_worker_handle_work(acct, worker); io_worker_exit(worker); return 0; @@ -683,7 +700,7 @@ void io_wq_worker_running(struct task_struct *tsk) if (worker->flags & IO_WORKER_F_RUNNING) return; worker->flags |= IO_WORKER_F_RUNNING; - io_wqe_inc_running(worker); + io_wq_inc_running(worker); } /* @@ -702,21 +719,21 @@ void io_wq_worker_sleeping(struct task_struct *tsk) return; worker->flags &= ~IO_WORKER_F_RUNNING; - io_wqe_dec_running(worker); + io_wq_dec_running(worker); } -static void io_init_new_worker(struct io_wqe *wqe, struct io_worker *worker, +static void io_init_new_worker(struct io_wq *wq, struct io_worker *worker, struct task_struct *tsk) { tsk->worker_private = worker; worker->task = tsk; - set_cpus_allowed_ptr(tsk, wqe->cpu_mask); + set_cpus_allowed_ptr(tsk, wq->cpu_mask); - raw_spin_lock(&wqe->lock); - hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); - list_add_tail_rcu(&worker->all_list, &wqe->all_list); + raw_spin_lock(&wq->lock); + hlist_nulls_add_head_rcu(&worker->nulls_node, &wq->free_list); + list_add_tail_rcu(&worker->all_list, &wq->all_list); worker->flags |= IO_WORKER_F_FREE; - raw_spin_unlock(&wqe->lock); + raw_spin_unlock(&wq->lock); wake_up_new_task(tsk); } @@ -749,21 +766,21 @@ static void create_worker_cont(struct callback_head *cb) { struct io_worker *worker; struct task_struct *tsk; - struct io_wqe *wqe; + struct io_wq *wq; worker = container_of(cb, struct io_worker, create_work); clear_bit_unlock(0, &worker->create_state); - wqe = worker->wqe; - tsk = create_io_thread(io_wqe_worker, worker, wqe->node); + wq = worker->wq; + tsk = create_io_thread(io_wq_worker, worker, NUMA_NO_NODE); if (!IS_ERR(tsk)) { - io_init_new_worker(wqe, worker, tsk); + io_init_new_worker(wq, worker, tsk); io_worker_release(worker); return; } else if (!io_should_retry_thread(PTR_ERR(tsk))) { - struct io_wqe_acct *acct = io_wqe_get_acct(worker); + struct io_wq_acct *acct = io_wq_get_acct(worker); atomic_dec(&acct->nr_running); - raw_spin_lock(&wqe->lock); + raw_spin_lock(&wq->lock); acct->nr_workers--; if (!acct->nr_workers) { struct io_cb_cancel_data match = { @@ -771,13 +788,13 @@ static void create_worker_cont(struct callback_head *cb) .cancel_all = true, }; - raw_spin_unlock(&wqe->lock); - while (io_acct_cancel_pending_work(wqe, acct, &match)) + raw_spin_unlock(&wq->lock); + while (io_acct_cancel_pending_work(wq, acct, &match)) ; } else { - raw_spin_unlock(&wqe->lock); + raw_spin_unlock(&wq->lock); } - io_worker_ref_put(wqe->wq); + io_worker_ref_put(wq); kfree(worker); return; } @@ -790,42 +807,42 @@ static void create_worker_cont(struct callback_head *cb) static void io_workqueue_create(struct work_struct *work) { struct io_worker *worker = container_of(work, struct io_worker, work); - struct io_wqe_acct *acct = io_wqe_get_acct(worker); 
+ struct io_wq_acct *acct = io_wq_get_acct(worker); if (!io_queue_worker_create(worker, acct, create_worker_cont)) kfree(worker); } -static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) +static bool create_io_worker(struct io_wq *wq, int index) { - struct io_wqe_acct *acct = &wqe->acct[index]; + struct io_wq_acct *acct = &wq->acct[index]; struct io_worker *worker; struct task_struct *tsk; __set_current_state(TASK_RUNNING); - worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node); + worker = kzalloc(sizeof(*worker), GFP_KERNEL); if (!worker) { fail: atomic_dec(&acct->nr_running); - raw_spin_lock(&wqe->lock); + raw_spin_lock(&wq->lock); acct->nr_workers--; - raw_spin_unlock(&wqe->lock); + raw_spin_unlock(&wq->lock); io_worker_ref_put(wq); return false; } refcount_set(&worker->ref, 1); - worker->wqe = wqe; + worker->wq = wq; raw_spin_lock_init(&worker->lock); init_completion(&worker->ref_done); if (index == IO_WQ_ACCT_BOUND) worker->flags |= IO_WORKER_F_BOUND; - tsk = create_io_thread(io_wqe_worker, worker, wqe->node); + tsk = create_io_thread(io_wq_worker, worker, NUMA_NO_NODE); if (!IS_ERR(tsk)) { - io_init_new_worker(wqe, worker, tsk); + io_init_new_worker(wq, worker, tsk); } else if (!io_should_retry_thread(PTR_ERR(tsk))) { kfree(worker); goto fail; @@ -841,14 +858,14 @@ fail: * Iterate the passed in list and call the specific function for each * worker that isn't exiting */ -static bool io_wq_for_each_worker(struct io_wqe *wqe, +static bool io_wq_for_each_worker(struct io_wq *wq, bool (*func)(struct io_worker *, void *), void *data) { struct io_worker *worker; bool ret = false; - list_for_each_entry_rcu(worker, &wqe->all_list, all_list) { + list_for_each_entry_rcu(worker, &wq->all_list, all_list) { if (io_worker_get(worker)) { /* no task if node is/was offline */ if (worker->task) @@ -869,10 +886,8 @@ static bool io_wq_worker_wake(struct io_worker *worker, void *data) return false; } -static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe) +static void io_run_cancel(struct io_wq_work *work, struct io_wq *wq) { - struct io_wq *wq = wqe->wq; - do { work->flags |= IO_WQ_WORK_CANCEL; wq->do_work(work); @@ -880,9 +895,9 @@ static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe) } while (work); } -static void io_wqe_insert_work(struct io_wqe *wqe, struct io_wq_work *work) +static void io_wq_insert_work(struct io_wq *wq, struct io_wq_work *work) { - struct io_wqe_acct *acct = io_work_get_acct(wqe, work); + struct io_wq_acct *acct = io_work_get_acct(wq, work); unsigned int hash; struct io_wq_work *tail; @@ -893,8 +908,8 @@ append: } hash = io_get_work_hash(work); - tail = wqe->hash_tail[hash]; - wqe->hash_tail[hash] = work; + tail = wq->hash_tail[hash]; + wq->hash_tail[hash] = work; if (!tail) goto append; @@ -906,9 +921,9 @@ static bool io_wq_work_match_item(struct io_wq_work *work, void *data) return work == data; } -static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) +void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) { - struct io_wqe_acct *acct = io_work_get_acct(wqe, work); + struct io_wq_acct *acct = io_work_get_acct(wq, work); struct io_cb_cancel_data match; unsigned work_flags = work->flags; bool do_create; @@ -917,55 +932,45 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) * If io-wq is exiting for this task, or if the request has explicitly * been marked as one that should not get executed, cancel it here. 
*/ - if (test_bit(IO_WQ_BIT_EXIT, &wqe->wq->state) || + if (test_bit(IO_WQ_BIT_EXIT, &wq->state) || (work->flags & IO_WQ_WORK_CANCEL)) { - io_run_cancel(work, wqe); + io_run_cancel(work, wq); return; } raw_spin_lock(&acct->lock); - io_wqe_insert_work(wqe, work); + io_wq_insert_work(wq, work); clear_bit(IO_ACCT_STALLED_BIT, &acct->flags); raw_spin_unlock(&acct->lock); - raw_spin_lock(&wqe->lock); rcu_read_lock(); - do_create = !io_wqe_activate_free_worker(wqe, acct); + do_create = !io_wq_activate_free_worker(wq, acct); rcu_read_unlock(); - raw_spin_unlock(&wqe->lock); - if (do_create && ((work_flags & IO_WQ_WORK_CONCURRENT) || !atomic_read(&acct->nr_running))) { bool did_create; - did_create = io_wqe_create_worker(wqe, acct); + did_create = io_wq_create_worker(wq, acct); if (likely(did_create)) return; - raw_spin_lock(&wqe->lock); + raw_spin_lock(&wq->lock); if (acct->nr_workers) { - raw_spin_unlock(&wqe->lock); + raw_spin_unlock(&wq->lock); return; } - raw_spin_unlock(&wqe->lock); + raw_spin_unlock(&wq->lock); /* fatal condition, failed to create the first worker */ match.fn = io_wq_work_match_item, match.data = work, match.cancel_all = false, - io_acct_cancel_pending_work(wqe, acct, &match); + io_acct_cancel_pending_work(wq, acct, &match); } } -void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) -{ - struct io_wqe *wqe = wq->wqes[numa_node_id()]; - - io_wqe_enqueue(wqe, work); -} - /* * Work items that hash to the same value will not be done in parallel. * Used to limit concurrent writes, generally hashed by inode. @@ -1008,27 +1013,27 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data) return match->nr_running && !match->cancel_all; } -static inline void io_wqe_remove_pending(struct io_wqe *wqe, +static inline void io_wq_remove_pending(struct io_wq *wq, struct io_wq_work *work, struct io_wq_work_node *prev) { - struct io_wqe_acct *acct = io_work_get_acct(wqe, work); + struct io_wq_acct *acct = io_work_get_acct(wq, work); unsigned int hash = io_get_work_hash(work); struct io_wq_work *prev_work = NULL; - if (io_wq_is_hashed(work) && work == wqe->hash_tail[hash]) { + if (io_wq_is_hashed(work) && work == wq->hash_tail[hash]) { if (prev) prev_work = container_of(prev, struct io_wq_work, list); if (prev_work && io_get_work_hash(prev_work) == hash) - wqe->hash_tail[hash] = prev_work; + wq->hash_tail[hash] = prev_work; else - wqe->hash_tail[hash] = NULL; + wq->hash_tail[hash] = NULL; } wq_list_del(&acct->work_list, &work->list, prev); } -static bool io_acct_cancel_pending_work(struct io_wqe *wqe, - struct io_wqe_acct *acct, +static bool io_acct_cancel_pending_work(struct io_wq *wq, + struct io_wq_acct *acct, struct io_cb_cancel_data *match) { struct io_wq_work_node *node, *prev; @@ -1039,9 +1044,9 @@ static bool io_acct_cancel_pending_work(struct io_wqe *wqe, work = container_of(node, struct io_wq_work, list); if (!match->fn(work, match->data)) continue; - io_wqe_remove_pending(wqe, work, prev); + io_wq_remove_pending(wq, work, prev); raw_spin_unlock(&acct->lock); - io_run_cancel(work, wqe); + io_run_cancel(work, wq); match->nr_pending++; /* not safe to continue after unlock */ return true; @@ -1051,15 +1056,15 @@ static bool io_acct_cancel_pending_work(struct io_wqe *wqe, return false; } -static void io_wqe_cancel_pending_work(struct io_wqe *wqe, - struct io_cb_cancel_data *match) +static void io_wq_cancel_pending_work(struct io_wq *wq, + struct io_cb_cancel_data *match) { int i; retry: for (i = 0; i < IO_WQ_ACCT_NR; i++) { - struct io_wqe_acct *acct = 
io_get_acct(wqe, i == 0); + struct io_wq_acct *acct = io_get_acct(wq, i == 0); - if (io_acct_cancel_pending_work(wqe, acct, match)) { + if (io_acct_cancel_pending_work(wq, acct, match)) { if (match->cancel_all) goto retry; break; @@ -1067,11 +1072,11 @@ retry: } } -static void io_wqe_cancel_running_work(struct io_wqe *wqe, +static void io_wq_cancel_running_work(struct io_wq *wq, struct io_cb_cancel_data *match) { rcu_read_lock(); - io_wq_for_each_worker(wqe, io_wq_worker_cancel, match); + io_wq_for_each_worker(wq, io_wq_worker_cancel, match); rcu_read_unlock(); } @@ -1083,7 +1088,6 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, .data = data, .cancel_all = cancel_all, }; - int node; /* * First check pending list, if we're lucky we can just remove it @@ -1095,22 +1099,18 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, * as an indication that we attempt to signal cancellation. The * completion will run normally in this case. * - * Do both of these while holding the wqe->lock, to ensure that + * Do both of these while holding the wq->lock, to ensure that * we'll find a work item regardless of state. */ - for_each_node(node) { - struct io_wqe *wqe = wq->wqes[node]; + io_wq_cancel_pending_work(wq, &match); + if (match.nr_pending && !match.cancel_all) + return IO_WQ_CANCEL_OK; - io_wqe_cancel_pending_work(wqe, &match); - if (match.nr_pending && !match.cancel_all) - return IO_WQ_CANCEL_OK; - - raw_spin_lock(&wqe->lock); - io_wqe_cancel_running_work(wqe, &match); - raw_spin_unlock(&wqe->lock); - if (match.nr_running && !match.cancel_all) - return IO_WQ_CANCEL_RUNNING; - } + raw_spin_lock(&wq->lock); + io_wq_cancel_running_work(wq, &match); + raw_spin_unlock(&wq->lock); + if (match.nr_running && !match.cancel_all) + return IO_WQ_CANCEL_RUNNING; if (match.nr_running) return IO_WQ_CANCEL_RUNNING; @@ -1119,20 +1119,20 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, return IO_WQ_CANCEL_NOTFOUND; } -static int io_wqe_hash_wake(struct wait_queue_entry *wait, unsigned mode, +static int io_wq_hash_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) { - struct io_wqe *wqe = container_of(wait, struct io_wqe, wait); + struct io_wq *wq = container_of(wait, struct io_wq, wait); int i; list_del_init(&wait->entry); rcu_read_lock(); for (i = 0; i < IO_WQ_ACCT_NR; i++) { - struct io_wqe_acct *acct = &wqe->acct[i]; + struct io_wq_acct *acct = &wq->acct[i]; if (test_and_clear_bit(IO_ACCT_STALLED_BIT, &acct->flags)) - io_wqe_activate_free_worker(wqe, acct); + io_wq_activate_free_worker(wq, acct); } rcu_read_unlock(); return 1; @@ -1140,7 +1140,7 @@ static int io_wqe_hash_wake(struct wait_queue_entry *wait, unsigned mode, struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) { - int ret, node, i; + int ret, i; struct io_wq *wq; if (WARN_ON_ONCE(!data->free_work || !data->do_work)) @@ -1148,12 +1148,9 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) if (WARN_ON_ONCE(!bounded)) return ERR_PTR(-EINVAL); - wq = kzalloc(struct_size(wq, wqes, nr_node_ids), GFP_KERNEL); + wq = kzalloc(sizeof(struct io_wq), GFP_KERNEL); if (!wq) return ERR_PTR(-ENOMEM); - ret = cpuhp_state_add_instance_nocalls(io_wq_online, &wq->cpuhp_node); - if (ret) - goto err_wq; refcount_inc(&data->hash->refs); wq->hash = data->hash; @@ -1161,53 +1158,39 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) wq->do_work = data->do_work; ret = -ENOMEM; - for_each_node(node) { - struct 
io_wqe *wqe; - int alloc_node = node; - if (!node_online(alloc_node)) - alloc_node = NUMA_NO_NODE; - wqe = kzalloc_node(sizeof(struct io_wqe), GFP_KERNEL, alloc_node); - if (!wqe) - goto err; - wq->wqes[node] = wqe; - if (!alloc_cpumask_var(&wqe->cpu_mask, GFP_KERNEL)) - goto err; - cpumask_copy(wqe->cpu_mask, cpumask_of_node(node)); - wqe->node = alloc_node; - wqe->acct[IO_WQ_ACCT_BOUND].max_workers = bounded; - wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers = - task_rlimit(current, RLIMIT_NPROC); - INIT_LIST_HEAD(&wqe->wait.entry); - wqe->wait.func = io_wqe_hash_wake; - for (i = 0; i < IO_WQ_ACCT_NR; i++) { - struct io_wqe_acct *acct = &wqe->acct[i]; + if (!alloc_cpumask_var(&wq->cpu_mask, GFP_KERNEL)) + goto err; + cpumask_copy(wq->cpu_mask, cpu_possible_mask); + wq->acct[IO_WQ_ACCT_BOUND].max_workers = bounded; + wq->acct[IO_WQ_ACCT_UNBOUND].max_workers = + task_rlimit(current, RLIMIT_NPROC); + INIT_LIST_HEAD(&wq->wait.entry); + wq->wait.func = io_wq_hash_wake; + for (i = 0; i < IO_WQ_ACCT_NR; i++) { + struct io_wq_acct *acct = &wq->acct[i]; - acct->index = i; - atomic_set(&acct->nr_running, 0); - INIT_WQ_LIST(&acct->work_list); - raw_spin_lock_init(&acct->lock); - } - wqe->wq = wq; - raw_spin_lock_init(&wqe->lock); - INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0); - INIT_LIST_HEAD(&wqe->all_list); + acct->index = i; + atomic_set(&acct->nr_running, 0); + INIT_WQ_LIST(&acct->work_list); + raw_spin_lock_init(&acct->lock); } + raw_spin_lock_init(&wq->lock); + INIT_HLIST_NULLS_HEAD(&wq->free_list, 0); + INIT_LIST_HEAD(&wq->all_list); + wq->task = get_task_struct(data->task); atomic_set(&wq->worker_refs, 1); init_completion(&wq->worker_done); + ret = cpuhp_state_add_instance_nocalls(io_wq_online, &wq->cpuhp_node); + if (ret) + goto err; + return wq; err: io_wq_put_hash(data->hash); - cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node); - for_each_node(node) { - if (!wq->wqes[node]) - continue; - free_cpumask_var(wq->wqes[node]->cpu_mask); - kfree(wq->wqes[node]); - } -err_wq: + free_cpumask_var(wq->cpu_mask); kfree(wq); return ERR_PTR(ret); } @@ -1219,7 +1202,7 @@ static bool io_task_work_match(struct callback_head *cb, void *data) if (cb->func != create_worker_cb && cb->func != create_worker_cont) return false; worker = container_of(cb, struct io_worker, create_work); - return worker->wqe->wq == data; + return worker->wq == data; } void io_wq_exit_start(struct io_wq *wq) @@ -1247,48 +1230,35 @@ static void io_wq_cancel_tw_create(struct io_wq *wq) static void io_wq_exit_workers(struct io_wq *wq) { - int node; - if (!wq->task) return; io_wq_cancel_tw_create(wq); rcu_read_lock(); - for_each_node(node) { - struct io_wqe *wqe = wq->wqes[node]; - - io_wq_for_each_worker(wqe, io_wq_worker_wake, NULL); - } + io_wq_for_each_worker(wq, io_wq_worker_wake, NULL); rcu_read_unlock(); io_worker_ref_put(wq); wait_for_completion(&wq->worker_done); - for_each_node(node) { - spin_lock_irq(&wq->hash->wait.lock); - list_del_init(&wq->wqes[node]->wait.entry); - spin_unlock_irq(&wq->hash->wait.lock); - } + spin_lock_irq(&wq->hash->wait.lock); + list_del_init(&wq->wait.entry); + spin_unlock_irq(&wq->hash->wait.lock); + put_task_struct(wq->task); wq->task = NULL; } static void io_wq_destroy(struct io_wq *wq) { - int node; + struct io_cb_cancel_data match = { + .fn = io_wq_work_match_all, + .cancel_all = true, + }; cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node); - - for_each_node(node) { - struct io_wqe *wqe = wq->wqes[node]; - struct io_cb_cancel_data match = { - .fn = 
io_wq_work_match_all, - .cancel_all = true, - }; - io_wqe_cancel_pending_work(wqe, &match); - free_cpumask_var(wqe->cpu_mask); - kfree(wqe); - } + io_wq_cancel_pending_work(wq, &match); + free_cpumask_var(wq->cpu_mask); io_wq_put_hash(wq->hash); kfree(wq); } @@ -1311,9 +1281,9 @@ static bool io_wq_worker_affinity(struct io_worker *worker, void *data) struct online_data *od = data; if (od->online) - cpumask_set_cpu(od->cpu, worker->wqe->cpu_mask); + cpumask_set_cpu(od->cpu, worker->wq->cpu_mask); else - cpumask_clear_cpu(od->cpu, worker->wqe->cpu_mask); + cpumask_clear_cpu(od->cpu, worker->wq->cpu_mask); return false; } @@ -1323,11 +1293,9 @@ static int __io_wq_cpu_online(struct io_wq *wq, unsigned int cpu, bool online) .cpu = cpu, .online = online }; - int i; rcu_read_lock(); - for_each_node(i) - io_wq_for_each_worker(wq->wqes[i], io_wq_worker_affinity, &od); + io_wq_for_each_worker(wq, io_wq_worker_affinity, &od); rcu_read_unlock(); return 0; } @@ -1346,20 +1314,18 @@ static int io_wq_cpu_offline(unsigned int cpu, struct hlist_node *node) return __io_wq_cpu_online(wq, cpu, false); } -int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask) +int io_wq_cpu_affinity(struct io_uring_task *tctx, cpumask_var_t mask) { - int i; + if (!tctx || !tctx->io_wq) + return -EINVAL; rcu_read_lock(); - for_each_node(i) { - struct io_wqe *wqe = wq->wqes[i]; - - if (mask) - cpumask_copy(wqe->cpu_mask, mask); - else - cpumask_copy(wqe->cpu_mask, cpumask_of_node(i)); - } + if (mask) + cpumask_copy(tctx->io_wq->cpu_mask, mask); + else + cpumask_copy(tctx->io_wq->cpu_mask, cpu_possible_mask); rcu_read_unlock(); + return 0; } @@ -1369,9 +1335,9 @@ int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask) */ int io_wq_max_workers(struct io_wq *wq, int *new_count) { + struct io_wq_acct *acct; int prev[IO_WQ_ACCT_NR]; - bool first_node = true; - int i, node; + int i; BUILD_BUG_ON((int) IO_WQ_ACCT_BOUND != (int) IO_WQ_BOUND); BUILD_BUG_ON((int) IO_WQ_ACCT_UNBOUND != (int) IO_WQ_UNBOUND); @@ -1386,21 +1352,15 @@ int io_wq_max_workers(struct io_wq *wq, int *new_count) prev[i] = 0; rcu_read_lock(); - for_each_node(node) { - struct io_wqe *wqe = wq->wqes[node]; - struct io_wqe_acct *acct; - raw_spin_lock(&wqe->lock); - for (i = 0; i < IO_WQ_ACCT_NR; i++) { - acct = &wqe->acct[i]; - if (first_node) - prev[i] = max_t(int, acct->max_workers, prev[i]); - if (new_count[i]) - acct->max_workers = new_count[i]; - } - raw_spin_unlock(&wqe->lock); - first_node = false; + raw_spin_lock(&wq->lock); + for (i = 0; i < IO_WQ_ACCT_NR; i++) { + acct = &wq->acct[i]; + prev[i] = max_t(int, acct->max_workers, prev[i]); + if (new_count[i]) + acct->max_workers = new_count[i]; } + raw_spin_unlock(&wq->lock); rcu_read_unlock(); for (i = 0; i < IO_WQ_ACCT_NR; i++) diff --git a/io_uring/io-wq.h b/io_uring/io-wq.h index 31228426d192..2b2a6406dd8e 100644 --- a/io_uring/io-wq.h +++ b/io_uring/io-wq.h @@ -50,8 +50,9 @@ void io_wq_put_and_exit(struct io_wq *wq); void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work); void io_wq_hash_work(struct io_wq_work *work, void *val); -int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask); +int io_wq_cpu_affinity(struct io_uring_task *tctx, cpumask_var_t mask); int io_wq_max_workers(struct io_wq *wq, int *new_count); +bool io_wq_worker_stopped(void); static inline bool io_wq_is_hashed(struct io_wq_work *work) { diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index bdd36d57f1b9..fad13cb99df1 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -73,6 +73,7 @@ #include #include 
#include +#include #define CREATE_TRACE_POINTS #include @@ -95,6 +96,7 @@ #include "timeout.h" #include "poll.h" +#include "rw.h" #include "alloc_cache.h" #define IORING_MAX_ENTRIES 32768 @@ -145,16 +147,13 @@ static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct task_struct *task, bool cancel_all); -static void io_dismantle_req(struct io_kiocb *req); -static void io_clean_op(struct io_kiocb *req); static void io_queue_sqe(struct io_kiocb *req); -static void io_move_task_work_from_local(struct io_ring_ctx *ctx); -static void __io_submit_flush_completions(struct io_ring_ctx *ctx); -static __cold void io_fallback_tw(struct io_uring_task *tctx); -static struct kmem_cache *req_cachep; +struct kmem_cache *req_cachep; static int __read_mostly sysctl_io_uring_disabled = 2; +static int __read_mostly sysctl_io_uring_group = -1; + #ifdef CONFIG_SYSCTL static struct ctl_table kernel_io_uring_disabled_table[] = { { @@ -166,6 +165,13 @@ static struct ctl_table kernel_io_uring_disabled_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, + { + .procname = "io_uring_group", + .data = &sysctl_io_uring_group, + .maxlen = sizeof(gid_t), + .mode = 0644, + .proc_handler = proc_dointvec, + }, {}, }; #endif @@ -262,17 +268,15 @@ static __cold void io_fallback_req_func(struct work_struct *work) fallback_work.work); struct llist_node *node = llist_del_all(&ctx->fallback_llist); struct io_kiocb *req, *tmp; - bool locked = false; + struct io_tw_state ts = { .locked = true, }; - percpu_ref_get(&ctx->refs); + mutex_lock(&ctx->uring_lock); llist_for_each_entry_safe(req, tmp, node, io_task_work.node) - req->io_task_work.func(req, &locked); - - if (locked) { - io_submit_flush_completions(ctx); - mutex_unlock(&ctx->uring_lock); - } - percpu_ref_put(&ctx->refs); + req->io_task_work.func(req, &ts); + if (WARN_ON_ONCE(!ts.locked)) + return; + io_submit_flush_completions(ctx); + mutex_unlock(&ctx->uring_lock); } static int io_alloc_hash_table(struct io_hash_table *table, unsigned bits) @@ -311,13 +315,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) goto err; if (io_alloc_hash_table(&ctx->cancel_table_locked, hash_bits)) goto err; - - ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL); - if (!ctx->dummy_ubuf) - goto err; - /* set invalid range, so io_import_fixed() fails meeting it */ - ctx->dummy_ubuf->ubuf = -1UL; - if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, 0, GFP_KERNEL)) goto err; @@ -327,12 +324,18 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->sqd_list); INIT_LIST_HEAD(&ctx->cq_overflow_list); INIT_LIST_HEAD(&ctx->io_buffers_cache); - io_alloc_cache_init(&ctx->apoll_cache); - io_alloc_cache_init(&ctx->netmsg_cache); + io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX, + sizeof(struct io_rsrc_node)); + io_alloc_cache_init(&ctx->apoll_cache, IO_ALLOC_CACHE_MAX, + sizeof(struct async_poll)); + io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX, + sizeof(struct io_async_msghdr)); init_completion(&ctx->ref_comp); xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); mutex_init(&ctx->uring_lock); init_waitqueue_head(&ctx->cq_wait); + init_waitqueue_head(&ctx->poll_wq); + init_waitqueue_head(&ctx->rsrc_quiesce_wq); spin_lock_init(&ctx->completion_lock); spin_lock_init(&ctx->timeout_lock); INIT_WQ_LIST(&ctx->iopoll_list); @@ -341,11 +344,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->defer_list); 
INIT_LIST_HEAD(&ctx->timeout_list); INIT_LIST_HEAD(&ctx->ltimeout_list); - spin_lock_init(&ctx->rsrc_ref_lock); INIT_LIST_HEAD(&ctx->rsrc_ref_list); - INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work); - init_task_work(&ctx->rsrc_put_tw, io_rsrc_put_tw); - init_llist_head(&ctx->rsrc_put_llist); init_llist_head(&ctx->work_llist); INIT_LIST_HEAD(&ctx->tctx_list); ctx->submit_state.free_list.next = NULL; @@ -354,7 +353,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_WQ_LIST(&ctx->submit_state.compl_reqs); return ctx; err: - kfree(ctx->dummy_ubuf); kfree(ctx->cancel_table.hbs); kfree(ctx->cancel_table_locked.hbs); kfree(ctx->io_bl); @@ -382,6 +380,39 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq) return false; } +static void io_clean_op(struct io_kiocb *req) +{ + if (req->flags & REQ_F_BUFFER_SELECTED) { + spin_lock(&req->ctx->completion_lock); + io_put_kbuf_comp(req); + spin_unlock(&req->ctx->completion_lock); + } + + if (req->flags & REQ_F_NEED_CLEANUP) { + const struct io_cold_def *def = &io_cold_defs[req->opcode]; + + if (def->cleanup) + def->cleanup(req); + } + if ((req->flags & REQ_F_POLLED) && req->apoll) { + kfree(req->apoll->double_poll); + kfree(req->apoll); + req->apoll = NULL; + } + if (req->flags & REQ_F_INFLIGHT) { + struct io_uring_task *tctx = req->task->io_uring; + + atomic_dec(&tctx->inflight_tracked); + } + if (req->flags & REQ_F_CREDS) + put_cred(req->creds); + if (req->flags & REQ_F_ASYNC_DATA) { + kfree(req->async_data); + req->async_data = NULL; + } + req->flags &= ~IO_REQ_CLEAN_FLAGS; +} + static inline void io_req_track_inflight(struct io_kiocb *req) { if (!(req->flags & REQ_F_INFLIGHT)) { @@ -424,7 +455,7 @@ static inline void io_arm_ltimeout(struct io_kiocb *req) static void io_prep_async_work(struct io_kiocb *req) { - const struct io_op_def *def = &io_op_defs[req->opcode]; + const struct io_issue_def *def = &io_issue_defs[req->opcode]; struct io_ring_ctx *ctx = req->ctx; if (!(req->flags & REQ_F_CREDS)) { @@ -438,11 +469,17 @@ static void io_prep_async_work(struct io_kiocb *req) if (req->flags & REQ_F_FORCE_ASYNC) req->work.flags |= IO_WQ_WORK_CONCURRENT; - if (req->file && !io_req_ffs_set(req)) - req->flags |= io_file_get_flags(req->file) << REQ_F_SUPPORT_NOWAIT_BIT; + if (req->file && !(req->flags & REQ_F_FIXED_FILE)) + req->flags |= io_file_get_flags(req->file); - if (req->flags & REQ_F_ISREG) { - if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL)) + if (req->file && (req->flags & REQ_F_ISREG)) { + bool should_hash = def->hash_reg_file; + + /* don't serialize this request if the fs doesn't need it */ + if (should_hash && (req->file->f_flags & O_DIRECT) && + (req->file->f_mode & FMODE_DIO_PARALLEL_WRITE)) + should_hash = false; + if (should_hash || (ctx->flags & IORING_SETUP_IOPOLL)) io_wq_hash_work(&req->work, file_inode(req->file)); } else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) { if (def->unbound_nonreg_file) @@ -467,7 +504,7 @@ static void io_prep_async_link(struct io_kiocb *req) } } -void io_queue_iowq(struct io_kiocb *req, bool *dont_use) +void io_queue_iowq(struct io_kiocb *req, struct io_tw_state *ts_dont_use) { struct io_kiocb *link = io_prep_linked_timeout(req); struct io_uring_task *tctx = req->task->io_uring; @@ -589,6 +626,8 @@ static void io_eventfd_flush_signal(struct io_ring_ctx *ctx) void __io_commit_cqring_flush(struct io_ring_ctx *ctx) { + if (ctx->poll_activated) + io_poll_wq_wake(ctx); if (ctx->off_timeout_used) io_flush_timeouts(ctx); if 
(ctx->drain_active) {
@@ -601,47 +640,37 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
 }
 
 static inline void __io_cq_lock(struct io_ring_ctx *ctx)
-	__acquires(ctx->completion_lock)
 {
-	if (!ctx->task_complete)
+	if (!ctx->lockless_cq)
 		spin_lock(&ctx->completion_lock);
 }
 
-static inline void __io_cq_unlock(struct io_ring_ctx *ctx)
-{
-	if (!ctx->task_complete)
-		spin_unlock(&ctx->completion_lock);
-}
-
 static inline void io_cq_lock(struct io_ring_ctx *ctx)
 	__acquires(ctx->completion_lock)
 {
 	spin_lock(&ctx->completion_lock);
 }
 
-static inline void io_cq_unlock(struct io_ring_ctx *ctx)
-	__releases(ctx->completion_lock)
-{
-	spin_unlock(&ctx->completion_lock);
-}
-
-/* keep it inlined for io_submit_flush_completions() */
 static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx)
-	__releases(ctx->completion_lock)
 {
 	io_commit_cqring(ctx);
-	__io_cq_unlock(ctx);
+	if (!ctx->task_complete) {
+		if (!ctx->lockless_cq)
+			spin_unlock(&ctx->completion_lock);
+		/* IOPOLL rings only need to wake up if it's also SQPOLL */
+		if (!ctx->syscall_iopoll)
+			io_cqring_wake(ctx);
+	}
 	io_commit_cqring_flush(ctx);
-	io_cqring_wake(ctx);
 }
 
-void io_cq_unlock_post(struct io_ring_ctx *ctx)
+static void io_cq_unlock_post(struct io_ring_ctx *ctx)
 	__releases(ctx->completion_lock)
 {
 	io_commit_cqring(ctx);
 	spin_unlock(&ctx->completion_lock);
-	io_commit_cqring_flush(ctx);
 	io_cqring_wake(ctx);
+	io_commit_cqring_flush(ctx);
 }
 
 /* Returns true if there are no backlogged entries after the flush */
@@ -650,10 +679,10 @@ static void io_cqring_overflow_kill(struct io_ring_ctx *ctx)
 	struct io_overflow_cqe *ocqe;
 	LIST_HEAD(list);
 
-	io_cq_lock(ctx);
+	spin_lock(&ctx->completion_lock);
 	list_splice_init(&ctx->cq_overflow_list, &list);
 	clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq);
-	io_cq_unlock(ctx);
+	spin_unlock(&ctx->completion_lock);
 
 	while (!list_empty(&list)) {
 		ocqe = list_first_entry(&list, struct io_overflow_cqe, list);
@@ -662,7 +691,6 @@ static void io_cqring_overflow_kill(struct io_ring_ctx *ctx)
 	}
 }
 
-/* Returns true if there are no backlogged entries after the flush */
 static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx)
 {
 	size_t cqe_size = sizeof(struct io_uring_cqe);
@@ -675,10 +703,10 @@ static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx)
 
 	io_cq_lock(ctx);
 	while (!list_empty(&ctx->cq_overflow_list)) {
-		struct io_uring_cqe *cqe = io_get_cqe_overflow(ctx, true);
+		struct io_uring_cqe *cqe;
 		struct io_overflow_cqe *ocqe;
 
-		if (!cqe)
+		if (!io_get_cqe_overflow(ctx, &cqe, true))
 			break;
 		ocqe = list_first_entry(&ctx->cq_overflow_list,
					struct io_overflow_cqe, list);
@@ -710,14 +738,30 @@ static void io_cqring_overflow_flush(struct io_ring_ctx *ctx)
 		io_cqring_do_overflow_flush(ctx);
 }
 
-void __io_put_task(struct task_struct *task, int nr)
+/* can be called by any task */
+static void io_put_task_remote(struct task_struct *task)
 {
 	struct io_uring_task *tctx = task->io_uring;
 
-	percpu_counter_sub(&tctx->inflight, nr);
-	if (unlikely(atomic_read(&tctx->in_idle)))
+	percpu_counter_sub(&tctx->inflight, 1);
+	if (unlikely(atomic_read(&tctx->in_cancel)))
 		wake_up(&tctx->wait);
-	put_task_struct_many(task, nr);
+	put_task_struct(task);
+}
+
+/* used by a task to put its own references */
+static void io_put_task_local(struct task_struct *task)
+{
+	task->io_uring->cached_refs++;
+}
+
+/* must be called shortly after putting a request */
+static inline void io_put_task(struct task_struct *task)
+{
+	if (likely(task == current))
+		io_put_task_local(task);
+	else
+		
io_put_task_remote(task); } void io_task_refs_refill(struct io_uring_task *tctx) @@ -781,15 +825,12 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, return true; } -bool io_req_cqe_overflow(struct io_kiocb *req) +void io_req_cqe_overflow(struct io_kiocb *req) { - if (!(req->flags & REQ_F_CQE32_INIT)) { - req->extra1 = 0; - req->extra2 = 0; - } - return io_cqring_event_overflow(req->ctx, req->cqe.user_data, - req->cqe.res, req->cqe.flags, - req->extra1, req->extra2); + io_cqring_event_overflow(req->ctx, req->cqe.user_data, + req->cqe.res, req->cqe.flags, + req->big_cqe.extra1, req->big_cqe.extra2); + memset(&req->big_cqe, 0, sizeof(req->big_cqe)); } /* @@ -797,7 +838,7 @@ bool io_req_cqe_overflow(struct io_kiocb *req) * control dependency is enough as we're using WRITE_ONCE to * fill the cq entry */ -struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow) +bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow) { struct io_rings *rings = ctx->rings; unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1); @@ -809,7 +850,7 @@ struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow) * Force overflow the completion. */ if (!overflow && (ctx->check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))) - return NULL; + return false; /* userspace may cheat modifying the tail, be safe and do min */ queued = min(__io_cqring_events(ctx), ctx->cq_entries); @@ -817,7 +858,7 @@ struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow) /* we need a contiguous range, limit based on the current array offset */ len = min(free, ctx->cq_entries - off); if (!len) - return NULL; + return false; if (ctx->flags & IORING_SETUP_CQE32) { off <<= 1; @@ -826,12 +867,7 @@ struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow) ctx->cqe_cached = &rings->cqes[off]; ctx->cqe_sentinel = ctx->cqe_cached + len; - - ctx->cached_cq_tail++; - ctx->cqe_cached++; - if (ctx->flags & IORING_SETUP_CQE32) - ctx->cqe_cached++; - return &rings->cqes[off]; + return true; } static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, @@ -846,8 +882,7 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, * submission (by quite a lot). Increment the overflow count in * the ring. */ - cqe = io_get_cqe(ctx); - if (likely(cqe)) { + if (likely(io_get_cqe(ctx, &cqe))) { trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0); WRITE_ONCE(cqe->user_data, user_data); @@ -871,10 +906,10 @@ static void __io_flush_post_cqes(struct io_ring_ctx *ctx) lockdep_assert_held(&ctx->uring_lock); for (i = 0; i < state->cqes_count; i++) { - struct io_uring_cqe *cqe = &state->cqes[i]; + struct io_uring_cqe *cqe = &ctx->completion_cqes[i]; if (!io_fill_cqe_aux(ctx, cqe->user_data, cqe->res, cqe->flags)) { - if (ctx->task_complete) { + if (ctx->lockless_cq) { spin_lock(&ctx->completion_lock); io_cqring_event_overflow(ctx, cqe->user_data, cqe->res, cqe->flags, 0, 0); @@ -907,20 +942,22 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags return __io_post_aux_cqe(ctx, user_data, res, cflags, true); } -bool io_aux_cqe(struct io_ring_ctx *ctx, bool defer, u64 user_data, s32 res, u32 cflags, - bool allow_overflow) +/* + * A helper for multishot requests posting additional CQEs. + * Should only be used from a task_work including IO_URING_F_MULTISHOT. 
+ */
+bool io_fill_cqe_req_aux(struct io_kiocb *req, bool defer, s32 res, u32 cflags)
 {
+	struct io_ring_ctx *ctx = req->ctx;
+	u64 user_data = req->cqe.user_data;
 	struct io_uring_cqe *cqe;
-	unsigned int length;
 
 	if (!defer)
-		return __io_post_aux_cqe(ctx, user_data, res, cflags, allow_overflow);
-
-	length = ARRAY_SIZE(ctx->submit_state.cqes);
+		return __io_post_aux_cqe(ctx, user_data, res, cflags, false);
 
 	lockdep_assert_held(&ctx->uring_lock);
 
-	if (ctx->submit_state.cqes_count == length) {
+	if (ctx->submit_state.cqes_count == ARRAY_SIZE(ctx->completion_cqes)) {
 		__io_cq_lock(ctx);
 		__io_flush_post_cqes(ctx);
 		/* no need to flush - flush is deferred */
@@ -931,23 +968,26 @@ bool io_aux_cqe(struct io_ring_ctx *ctx, bool defer, u64 user_data, s32 res, u32
 	 * however its main job is to prevent unbounded posted completions,
 	 * and in that it works just as well.
 	 */
-	if (!allow_overflow && test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))
+	if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))
 		return false;
 
-	cqe = &ctx->submit_state.cqes[ctx->submit_state.cqes_count++];
+	cqe = &ctx->completion_cqes[ctx->submit_state.cqes_count++];
 	cqe->user_data = user_data;
 	cqe->res = res;
 	cqe->flags = cflags;
 	return true;
 }
 
-static void __io_req_complete_post(struct io_kiocb *req)
+static void __io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
 {
 	struct io_ring_ctx *ctx = req->ctx;
+	struct io_rsrc_node *rsrc_node = NULL;
 
 	io_cq_lock(ctx);
-	if (!(req->flags & REQ_F_CQE_SKIP))
-		io_fill_cqe_req(ctx, req);
+	if (!(req->flags & REQ_F_CQE_SKIP)) {
+		if (!io_fill_cqe_req(ctx, req))
+			io_req_cqe_overflow(req);
+	}
 
 	/*
 	 * If we're the last reference to this request, add to our locked
@@ -962,19 +1002,28 @@ static void __io_req_complete_post(struct io_kiocb *req)
 			req->link = NULL;
 		}
 	}
-	io_req_put_rsrc(req);
+	io_put_kbuf_comp(req);
+	if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS))
+		io_clean_op(req);
+	io_put_file(req);
+
+	rsrc_node = req->rsrc_node;
 	/*
 	 * Selected buffer deallocation in io_clean_op() assumes that
 	 * we don't hold ->completion_lock. Clean them here to avoid
	 * deadlocks.
*/ - io_put_kbuf_comp(req); - io_dismantle_req(req); - io_put_task(req->task, 1); + io_put_task_remote(req->task); wq_list_add_head(&req->comp_list, &ctx->locked_free_list); ctx->locked_free_nr++; } io_cq_unlock_post(ctx); + + if (rsrc_node) { + io_ring_submit_lock(ctx, issue_flags); + io_put_rsrc_node(ctx, rsrc_node); + io_ring_submit_unlock(ctx, issue_flags); + } } void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) @@ -984,12 +1033,12 @@ void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) io_req_task_work_add(req); } else if (!(issue_flags & IO_URING_F_UNLOCKED) || !(req->ctx->flags & IORING_SETUP_IOPOLL)) { - __io_req_complete_post(req); + __io_req_complete_post(req, issue_flags); } else { struct io_ring_ctx *ctx = req->ctx; mutex_lock(&ctx->uring_lock); - __io_req_complete_post(req); + __io_req_complete_post(req, issue_flags & ~IO_URING_F_UNLOCKED); mutex_unlock(&ctx->uring_lock); } } @@ -997,7 +1046,7 @@ void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) void io_req_defer_failed(struct io_kiocb *req, s32 res) __must_hold(&ctx->uring_lock) { - const struct io_op_def *def = &io_op_defs[req->opcode]; + const struct io_cold_def *def = &io_cold_defs[req->opcode]; lockdep_assert_held(&req->ctx->uring_lock); @@ -1018,7 +1067,8 @@ static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx) req->link = NULL; req->async_data = NULL; /* not necessary, but safer to zero */ - req->cqe.res = 0; + memset(&req->cqe, 0, sizeof(req->cqe)); + memset(&req->big_cqe, 0, sizeof(req->big_cqe)); } static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, @@ -1077,28 +1127,14 @@ __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) return true; } -static inline void io_dismantle_req(struct io_kiocb *req) -{ - unsigned int flags = req->flags; - - if (unlikely(flags & IO_REQ_CLEAN_FLAGS)) - io_clean_op(req); - if (!(flags & REQ_F_FIXED_FILE)) - io_put_file(req->file); -} - __cold void io_free_req(struct io_kiocb *req) { - struct io_ring_ctx *ctx = req->ctx; - - io_req_put_rsrc(req); - io_dismantle_req(req); - io_put_task(req->task, 1); - - spin_lock(&ctx->completion_lock); - wq_list_add_head(&req->comp_list, &ctx->locked_free_list); - ctx->locked_free_nr++; - spin_unlock(&ctx->completion_lock); + /* refs were already put, restore them for io_req_task_complete() */ + req->flags &= ~REQ_F_REFCOUNT; + /* we only want to free it, don't post CQEs */ + req->flags |= REQ_F_CQE_SKIP; + req->io_task_work.func = io_req_task_complete; + io_req_task_work_add(req); } static void __io_req_find_next_prep(struct io_kiocb *req) @@ -1127,27 +1163,28 @@ static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) return nxt; } -static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked) +static void ctx_flush_and_put(struct io_ring_ctx *ctx, struct io_tw_state *ts) { if (!ctx) return; if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); - if (*locked) { + if (ts->locked) { io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); - *locked = false; + ts->locked = false; } percpu_ref_put(&ctx->refs); } static unsigned int handle_tw_list(struct llist_node *node, - struct io_ring_ctx **ctx, bool *locked, + struct io_ring_ctx **ctx, + struct io_tw_state *ts, struct llist_node *last) { unsigned int count = 0; - while (node != last) { + while (node && node != last) { struct llist_node *next = node->next; struct io_kiocb *req = container_of(node, struct io_kiocb, io_task_work.node); 
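The conversions in this region replace the bare bool *locked threaded through task_work callbacks with struct io_tw_state. A minimal sketch of the contract, assuming the definitions this series adds to the io_uring headers:

/* ts->locked tracks whether ->uring_lock is held across tw callbacks */
struct io_tw_state {
	bool locked;
};

typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts);

static inline void io_tw_lock(struct io_ring_ctx *ctx, struct io_tw_state *ts)
{
	if (!ts->locked) {
		mutex_lock(&ctx->uring_lock);
		ts->locked = true;
	}
}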
@@ -1155,18 +1192,19 @@ static unsigned int handle_tw_list(struct llist_node *node, prefetch(container_of(next, struct io_kiocb, io_task_work.node)); if (req->ctx != *ctx) { - ctx_flush_and_put(*ctx, locked); + ctx_flush_and_put(*ctx, ts); *ctx = req->ctx; /* if not contended, grab and improve batching */ - *locked = mutex_trylock(&(*ctx)->uring_lock); + ts->locked = mutex_trylock(&(*ctx)->uring_lock); percpu_ref_get(&(*ctx)->refs); - } else if (!*locked) - *locked = mutex_trylock(&(*ctx)->uring_lock); - req->io_task_work.func(req, locked); + } + INDIRECT_CALL_2(req->io_task_work.func, + io_poll_task_func, io_req_rw_complete, + req, ts); node = next; count++; if (unlikely(need_resched())) { - ctx_flush_and_put(*ctx, locked); + ctx_flush_and_put(*ctx, ts); *ctx = NULL; cond_resched(); } @@ -1205,93 +1243,131 @@ static inline struct llist_node *io_llist_cmpxchg(struct llist_head *head, return cmpxchg(&head->first, old, new); } -void tctx_task_work(struct callback_head *cb) -{ - bool uring_locked = false; - struct io_ring_ctx *ctx = NULL; - struct io_uring_task *tctx = container_of(cb, struct io_uring_task, - task_work); - struct llist_node fake = {}; - struct llist_node *node; - unsigned int loops = 1; - unsigned int count; - - if (unlikely(current->flags & PF_EXITING)) { - io_fallback_tw(tctx); - return; - } - - node = io_llist_xchg(&tctx->task_list, &fake); - count = handle_tw_list(node, &ctx, &uring_locked, NULL); - node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL); - while (node != &fake) { - loops++; - node = io_llist_xchg(&tctx->task_list, &fake); - count += handle_tw_list(node, &ctx, &uring_locked, &fake); - node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL); - } - - ctx_flush_and_put(ctx, &uring_locked); - - /* relaxed read is enough as only the task itself sets ->in_idle */ - if (unlikely(atomic_read(&tctx->in_idle))) - io_uring_drop_tctx_refs(current); - - trace_io_uring_task_work_run(tctx, count, loops); -} - -static __cold void io_fallback_tw(struct io_uring_task *tctx) +static __cold void io_fallback_tw(struct io_uring_task *tctx, bool sync) { struct llist_node *node = llist_del_all(&tctx->task_list); + struct io_ring_ctx *last_ctx = NULL; struct io_kiocb *req; while (node) { req = container_of(node, struct io_kiocb, io_task_work.node); node = node->next; + if (sync && last_ctx != req->ctx) { + if (last_ctx) { + flush_delayed_work(&last_ctx->fallback_work); + percpu_ref_put(&last_ctx->refs); + } + last_ctx = req->ctx; + percpu_ref_get(&last_ctx->refs); + } if (llist_add(&req->io_task_work.node, &req->ctx->fallback_llist)) schedule_delayed_work(&req->ctx->fallback_work, 1); } + + if (last_ctx) { + flush_delayed_work(&last_ctx->fallback_work); + percpu_ref_put(&last_ctx->refs); + } } -static void io_req_local_work_add(struct io_kiocb *req) +void tctx_task_work(struct callback_head *cb) +{ + struct io_tw_state ts = {}; + struct io_ring_ctx *ctx = NULL; + struct io_uring_task *tctx = container_of(cb, struct io_uring_task, + task_work); + struct llist_node fake = {}; + struct llist_node *node; + unsigned int loops = 0; + unsigned int count = 0; + + if (unlikely(current->flags & PF_EXITING)) { + io_fallback_tw(tctx, true); + return; + } + + do { + loops++; + node = io_llist_xchg(&tctx->task_list, &fake); + count += handle_tw_list(node, &ctx, &ts, &fake); + + /* skip expensive cmpxchg if there are items in the list */ + if (READ_ONCE(tctx->task_list.first) != &fake) + continue; + if (ts.locked && !wq_list_empty(&ctx->submit_state.compl_reqs)) { + 
io_submit_flush_completions(ctx); + if (READ_ONCE(tctx->task_list.first) != &fake) + continue; + } + node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL); + } while (node != &fake); + + ctx_flush_and_put(ctx, &ts); + + /* relaxed read is enough as only the task itself sets ->in_cancel */ + if (unlikely(atomic_read(&tctx->in_cancel))) + io_uring_drop_tctx_refs(current); + + trace_io_uring_task_work_run(tctx, count, loops); +} + +static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags) { struct io_ring_ctx *ctx = req->ctx; + unsigned nr_wait, nr_tw, nr_tw_prev; + struct llist_node *first; - percpu_ref_get(&ctx->refs); + if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) + flags &= ~IOU_F_TWQ_LAZY_WAKE; - if (!llist_add(&req->io_task_work.node, &ctx->work_llist)) { - percpu_ref_put(&ctx->refs); - return; + first = READ_ONCE(ctx->work_llist.first); + do { + nr_tw_prev = 0; + if (first) { + struct io_kiocb *first_req = container_of(first, + struct io_kiocb, + io_task_work.node); + /* + * Might be executed at any moment, rely on + * SLAB_TYPESAFE_BY_RCU to keep it alive. + */ + nr_tw_prev = READ_ONCE(first_req->nr_tw); + } + nr_tw = nr_tw_prev + 1; + /* Large enough to fail the nr_wait comparison below */ + if (!(flags & IOU_F_TWQ_LAZY_WAKE)) + nr_tw = -1U; + + req->nr_tw = nr_tw; + req->io_task_work.node.next = first; + } while (!try_cmpxchg(&ctx->work_llist.first, &first, + &req->io_task_work.node)); + + if (!first) { + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) + atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); + if (ctx->has_evfd) + io_eventfd_signal(ctx); } - /* need it for the following io_cqring_wake() */ + + nr_wait = atomic_read(&ctx->cq_wait_nr); + /* no one is waiting */ + if (!nr_wait) + return; + /* either not enough or the previous add has already woken it up */ + if (nr_wait > nr_tw || nr_tw_prev >= nr_wait) + return; + /* pairs with set_current_state() in io_cqring_wait() */ smp_mb__after_atomic(); - - if (unlikely(atomic_read(&req->task->io_uring->in_idle))) { - io_move_task_work_from_local(ctx); - percpu_ref_put(&ctx->refs); - return; - } - - if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) - atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); - - if (ctx->has_evfd) - io_eventfd_signal(ctx); - __io_cqring_wake(ctx); - percpu_ref_put(&ctx->refs); + wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE); } -void __io_req_task_work_add(struct io_kiocb *req, bool allow_local) +static void io_req_normal_work_add(struct io_kiocb *req) { struct io_uring_task *tctx = req->task->io_uring; struct io_ring_ctx *ctx = req->ctx; - if (allow_local && ctx->flags & IORING_SETUP_DEFER_TASKRUN) { - io_req_local_work_add(req); - return; - } - /* task_work already pending, we're done */ if (!llist_add(&req->io_task_work.node, &tctx->task_list)) return; @@ -1302,7 +1378,18 @@ void __io_req_task_work_add(struct io_kiocb *req, bool allow_local) if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method))) return; - io_fallback_tw(tctx); + io_fallback_tw(tctx, false); +} + +void __io_req_task_work_add(struct io_kiocb *req, unsigned flags) +{ + if (req->ctx->flags & IORING_SETUP_DEFER_TASKRUN) { + rcu_read_lock(); + io_req_local_work_add(req, flags); + rcu_read_unlock(); + } else { + io_req_normal_work_add(req); + } } static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) @@ -1315,83 +1402,94 @@ static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) io_task_work.node); node = node->next; - __io_req_task_work_add(req, false); + 
io_req_normal_work_add(req); } } -int __io_run_local_work(struct io_ring_ctx *ctx, bool *locked) +static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts) { struct llist_node *node; - struct llist_node fake; - struct llist_node *current_final = NULL; - int ret; - unsigned int loops = 1; + unsigned int loops = 0; + int ret = 0; - if (unlikely(ctx->submitter_task != current)) + if (WARN_ON_ONCE(ctx->submitter_task != current)) return -EEXIST; - - node = io_llist_xchg(&ctx->work_llist, &fake); - ret = 0; + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) + atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); again: - while (node != current_final) { + /* + * llists are in reverse order, flip it back the right way before + * running the pending items. + */ + node = llist_reverse_order(io_llist_xchg(&ctx->work_llist, NULL)); + while (node) { struct llist_node *next = node->next; struct io_kiocb *req = container_of(node, struct io_kiocb, io_task_work.node); prefetch(container_of(next, struct io_kiocb, io_task_work.node)); - req->io_task_work.func(req, locked); + INDIRECT_CALL_2(req->io_task_work.func, + io_poll_task_func, io_req_rw_complete, + req, ts); ret++; node = next; } + loops++; - if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) - atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); - - node = io_llist_cmpxchg(&ctx->work_llist, &fake, NULL); - if (node != &fake) { - loops++; - current_final = &fake; - node = io_llist_xchg(&ctx->work_llist, &fake); + if (!llist_empty(&ctx->work_llist)) goto again; - } - - if (*locked) + if (ts->locked) { io_submit_flush_completions(ctx); + if (!llist_empty(&ctx->work_llist)) + goto again; + } trace_io_uring_local_work_run(ctx, ret, loops); return ret; - } -int io_run_local_work(struct io_ring_ctx *ctx) +static inline int io_run_local_work_locked(struct io_ring_ctx *ctx) { - bool locked; + struct io_tw_state ts = { .locked = true, }; int ret; if (llist_empty(&ctx->work_llist)) return 0; - __set_current_state(TASK_RUNNING); - locked = mutex_trylock(&ctx->uring_lock); - ret = __io_run_local_work(ctx, &locked); - if (locked) + ret = __io_run_local_work(ctx, &ts); + /* shouldn't happen! 
*/ + if (WARN_ON_ONCE(!ts.locked)) + mutex_lock(&ctx->uring_lock); + return ret; +} + +static int io_run_local_work(struct io_ring_ctx *ctx) +{ + struct io_tw_state ts = {}; + int ret; + + ts.locked = mutex_trylock(&ctx->uring_lock); + ret = __io_run_local_work(ctx, &ts); + if (ts.locked) mutex_unlock(&ctx->uring_lock); return ret; } -static void io_req_task_cancel(struct io_kiocb *req, bool *locked) +static void io_req_task_cancel(struct io_kiocb *req, struct io_tw_state *ts) { - io_tw_lock(req->ctx, locked); + io_tw_lock(req->ctx, ts); io_req_defer_failed(req, req->cqe.res); } -void io_req_task_submit(struct io_kiocb *req, bool *locked) +void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts) { - io_tw_lock(req->ctx, locked); + io_tw_lock(req->ctx, ts); /* req->task == current here, checking PF_EXITING is safe */ - if (likely(!(req->task->flags & PF_EXITING))) - io_queue_sqe(req); - else + if (unlikely(req->task->flags & PF_EXITING)) io_req_defer_failed(req, -EFAULT); + else if (req->flags & REQ_F_FORCE_ASYNC) + io_queue_iowq(req, ts); + else + io_queue_sqe(req); } void io_req_task_queue_fail(struct io_kiocb *req, int ret) @@ -1415,12 +1513,10 @@ void io_queue_next(struct io_kiocb *req) io_req_task_queue(nxt); } -void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node) +static void io_free_batch_list(struct io_ring_ctx *ctx, + struct io_wq_work_node *node) __must_hold(&ctx->uring_lock) { - struct task_struct *task = NULL; - int task_refs = 0; - do { struct io_kiocb *req = container_of(node, struct io_kiocb, comp_list); @@ -1445,43 +1541,33 @@ void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node) if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS)) io_clean_op(req); } - if (!(req->flags & REQ_F_FIXED_FILE)) - io_put_file(req->file); + io_put_file(req); io_req_put_rsrc_locked(req, ctx); - if (req->task != task) { - if (task) - io_put_task(task, task_refs); - task = req->task; - task_refs = 0; - } - task_refs++; + io_put_task(req->task); node = req->comp_list.next; io_req_add_to_cache(req, ctx); } while (node); - - if (task) - io_put_task(task, task_refs); } -static void __io_submit_flush_completions(struct io_ring_ctx *ctx) +void __io_submit_flush_completions(struct io_ring_ctx *ctx) __must_hold(&ctx->uring_lock) { - struct io_wq_work_node *node, *prev; struct io_submit_state *state = &ctx->submit_state; + struct io_wq_work_node *node; __io_cq_lock(ctx); /* must come first to preserve CQE ordering in failure cases */ if (state->cqes_count) __io_flush_post_cqes(ctx); - wq_list_for_each(node, prev, &state->compl_reqs) { + __wq_list_for_each(node, &state->compl_reqs) { struct io_kiocb *req = container_of(node, struct io_kiocb, comp_list); if (!(req->flags & REQ_F_CQE_SKIP) && - unlikely(!__io_fill_cqe_req(ctx, req))) { - if (ctx->task_complete) { + unlikely(!io_fill_cqe_req(ctx, req))) { + if (ctx->lockless_cq) { spin_lock(&ctx->completion_lock); io_req_cqe_overflow(req); spin_unlock(&ctx->completion_lock); @@ -1498,22 +1584,6 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx) } } -/* - * Drop reference to request, return next in chain (if there is one) if this - * was the last reference to this request. 
- */ -static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req) -{ - struct io_kiocb *nxt = NULL; - - if (req_ref_put_and_test(req)) { - if (unlikely(req->flags & IO_REQ_LINK_FLAGS)) - nxt = io_req_find_next(req); - io_free_req(req); - } - return nxt; -} - static unsigned io_cqring_events(struct io_ring_ctx *ctx) { /* See comment at the top of this file */ @@ -1552,7 +1622,6 @@ static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) static int io_iopoll_check(struct io_ring_ctx *ctx, long min) { unsigned int nr_events = 0; - int ret = 0; unsigned long check_cq; if (!io_allowed_run_tw(ctx)) @@ -1578,6 +1647,8 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) return 0; do { + int ret = 0; + /* * If a submit got punted to a workqueue, we can have the * application entering polling for a command before it gets @@ -1606,18 +1677,23 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) break; } ret = io_do_iopoll(ctx, !min); - if (ret < 0) - break; - nr_events += ret; - ret = 0; - } while (nr_events < min && !need_resched()); + if (unlikely(ret < 0)) + return ret; - return ret; + if (task_sigpending(current)) + return -EINTR; + if (need_resched()) + break; + + nr_events += ret; + } while (nr_events < min); + + return 0; } -void io_req_task_complete(struct io_kiocb *req, bool *locked) +void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts) { - if (*locked) + if (ts->locked) io_req_complete_defer(req); else io_req_complete_post(req, IO_URING_F_UNLOCKED); @@ -1678,61 +1754,21 @@ static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags) } } -static bool io_bdev_nowait(struct block_device *bdev) -{ - return !bdev || bdev_nowait(bdev); -} - -/* - * If we tracked the file through the SCM inflight mechanism, we could support - * any file. For now, just ensure that anything potentially problematic is done - * inline. - */ -static bool __io_file_supports_nowait(struct file *file, umode_t mode) -{ - if (S_ISBLK(mode)) { - if (IS_ENABLED(CONFIG_BLOCK) && - io_bdev_nowait(I_BDEV(file->f_mapping->host))) - return true; - return false; - } - if (S_ISSOCK(mode)) - return true; - if (S_ISREG(mode)) { - if (IS_ENABLED(CONFIG_BLOCK) && - io_bdev_nowait(file->f_inode->i_sb->s_bdev) && - !io_is_uring_fops(file)) - return true; - return false; - } - - /* any ->read/write should understand O_NONBLOCK */ - if (file->f_flags & O_NONBLOCK) - return true; - return file->f_mode & FMODE_NOWAIT; -} - -/* - * If we tracked the file through the SCM inflight mechanism, we could support - * any file. For now, just ensure that anything potentially problematic is done - * inline. 
- */ unsigned int io_file_get_flags(struct file *file) { - umode_t mode = file_inode(file)->i_mode; unsigned int res = 0; - if (S_ISREG(mode)) - res |= FFS_ISREG; - if (__io_file_supports_nowait(file, mode)) - res |= FFS_NOWAIT; + if (S_ISREG(file_inode(file)->i_mode)) + res |= REQ_F_ISREG; + if ((file->f_flags & O_NONBLOCK) || (file->f_mode & FMODE_NOWAIT)) + res |= REQ_F_SUPPORT_NOWAIT; return res; } bool io_alloc_async_data(struct io_kiocb *req) { - WARN_ON_ONCE(!io_op_defs[req->opcode].async_size); - req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL); + WARN_ON_ONCE(!io_cold_defs[req->opcode].async_size); + req->async_data = kmalloc(io_cold_defs[req->opcode].async_size, GFP_KERNEL); if (req->async_data) { req->flags |= REQ_F_ASYNC_DATA; return false; @@ -1742,20 +1778,21 @@ bool io_alloc_async_data(struct io_kiocb *req) int io_req_prep_async(struct io_kiocb *req) { - const struct io_op_def *def = &io_op_defs[req->opcode]; + const struct io_cold_def *cdef = &io_cold_defs[req->opcode]; + const struct io_issue_def *def = &io_issue_defs[req->opcode]; /* assign early for deferred execution for non-fixed file */ if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE) && !req->file) req->file = io_file_get_normal(req, req->cqe.fd); - if (!def->prep_async) + if (!cdef->prep_async) return 0; if (WARN_ON_ONCE(req_has_async_data(req))) return -EFAULT; - if (!io_op_defs[req->opcode].manual_alloc) { + if (!def->manual_alloc) { if (io_alloc_async_data(req)) return -EAGAIN; } - return def->prep_async(req); + return cdef->prep_async(req); } static u32 io_get_sequence(struct io_kiocb *req) @@ -1810,42 +1847,10 @@ queue: spin_unlock(&ctx->completion_lock); } -static void io_clean_op(struct io_kiocb *req) +static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def, + unsigned int issue_flags) { - if (req->flags & REQ_F_BUFFER_SELECTED) { - spin_lock(&req->ctx->completion_lock); - io_put_kbuf_comp(req); - spin_unlock(&req->ctx->completion_lock); - } - - if (req->flags & REQ_F_NEED_CLEANUP) { - const struct io_op_def *def = &io_op_defs[req->opcode]; - - if (def->cleanup) - def->cleanup(req); - } - if ((req->flags & REQ_F_POLLED) && req->apoll) { - kfree(req->apoll->double_poll); - kfree(req->apoll); - req->apoll = NULL; - } - if (req->flags & REQ_F_INFLIGHT) { - struct io_uring_task *tctx = req->task->io_uring; - - atomic_dec(&tctx->inflight_tracked); - } - if (req->flags & REQ_F_CREDS) - put_cred(req->creds); - if (req->flags & REQ_F_ASYNC_DATA) { - kfree(req->async_data); - req->async_data = NULL; - } - req->flags &= ~IO_REQ_CLEAN_FLAGS; -} - -static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags) -{ - if (req->file || !io_op_defs[req->opcode].needs_file) + if (req->file || !def->needs_file) return true; if (req->flags & REQ_F_FIXED_FILE) @@ -1858,11 +1863,11 @@ static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags) static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) { - const struct io_op_def *def = &io_op_defs[req->opcode]; + const struct io_issue_def *def = &io_issue_defs[req->opcode]; const struct cred *creds = NULL; int ret; - if (unlikely(!io_assign_file(req, issue_flags))) + if (unlikely(!io_assign_file(req, def, issue_flags))) return -EBADF; if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred())) @@ -1894,9 +1899,9 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) return 0; } -int io_poll_issue(struct io_kiocb *req, bool *locked) +int 
io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts) { - io_tw_lock(req->ctx, locked); + io_tw_lock(req->ctx, ts); return io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_MULTISHOT| IO_URING_F_COMPLETE_DEFER); } @@ -1904,15 +1909,20 @@ int io_poll_issue(struct io_kiocb *req, bool *locked) struct io_wq_work *io_wq_free_work(struct io_wq_work *work) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); + struct io_kiocb *nxt = NULL; - req = io_put_req_find_next(req); - return req ? &req->work : NULL; + if (req_ref_put_and_test(req)) { + if (req->flags & IO_REQ_LINK_FLAGS) + nxt = io_req_find_next(req); + io_free_req(req); + } + return nxt ? &nxt->work : NULL; } void io_wq_submit_work(struct io_wq_work *work) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); - const struct io_op_def *def = &io_op_defs[req->opcode]; + const struct io_issue_def *def = &io_issue_defs[req->opcode]; unsigned int issue_flags = IO_URING_F_UNLOCKED | IO_URING_F_IOWQ; bool needs_poll = false; int ret = 0, err = -ECANCELED; @@ -1931,7 +1941,7 @@ fail: io_req_task_queue_fail(req, err); return; } - if (!io_assign_file(req, issue_flags)) { + if (!io_assign_file(req, def, issue_flags)) { err = -EBADF; work->flags |= IO_WQ_WORK_CANCEL; goto fail; @@ -1966,6 +1976,8 @@ fail: if (!needs_poll) { if (!(req->ctx->flags & IORING_SETUP_IOPOLL)) break; + if (io_wq_worker_stopped()) + break; cond_resched(); continue; } @@ -1986,19 +1998,17 @@ inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; + struct io_fixed_file *slot; struct file *file = NULL; - unsigned long file_ptr; io_ring_submit_lock(ctx, issue_flags); if (unlikely((unsigned int)fd >= ctx->nr_user_files)) goto out; fd = array_index_nospec(fd, ctx->nr_user_files); - file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr; - file = (struct file *) (file_ptr & FFS_MASK); - file_ptr &= ~FFS_MASK; - /* mask in overlapping REQ_F and FFS bits */ - req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT); + slot = io_fixed_file_slot(&ctx->file_table, fd); + file = io_slot_file(slot); + req->flags |= io_slot_flags(slot); io_req_set_rsrc_node(req, ctx, 0); out: io_ring_submit_unlock(ctx, issue_flags); @@ -2135,7 +2145,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, const struct io_uring_sqe *sqe) __must_hold(&ctx->uring_lock) { - const struct io_op_def *def; + const struct io_issue_def *def; unsigned int sqe_flags; int personality; u8 opcode; @@ -2153,7 +2163,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->opcode = 0; return -EINVAL; } - def = &io_op_defs[opcode]; + def = &io_issue_defs[opcode]; if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) { /* enforce forwards compatibility on users */ if (sqe_flags & ~SQE_VALID_FLAGS) @@ -2273,8 +2283,7 @@ static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, if (unlikely(ret)) return io_submit_fail_init(sqe, req, ret); - /* don't need @sqe from now on */ - trace_io_uring_submit_sqe(req, true); + trace_io_uring_submit_req(req); /* * If we already have a head request, queue this one for async @@ -2364,10 +2373,23 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) * used, it's important that those reads are done through READ_ONCE() to * prevent a re-load down the line. 
*/ -static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx) +static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe) { - unsigned head, mask = ctx->sq_entries - 1; - unsigned sq_idx = ctx->cached_sq_head++ & mask; + unsigned mask = ctx->sq_entries - 1; + unsigned head = ctx->cached_sq_head++ & mask; + + if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) { + head = READ_ONCE(ctx->sq_array[head]); + if (unlikely(head >= ctx->sq_entries)) { + /* drop invalid entries */ + spin_lock(&ctx->completion_lock); + ctx->cq_extra--; + spin_unlock(&ctx->completion_lock); + WRITE_ONCE(ctx->rings->sq_dropped, + READ_ONCE(ctx->rings->sq_dropped) + 1); + return false; + } + } /* * The cached sq head (or cq tail) serves two purposes: @@ -2377,19 +2399,12 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx) * 2) allows the kernel side to track the head on its own, even * though the application is the one updating it. */ - head = READ_ONCE(ctx->sq_array[sq_idx]); - if (likely(head < ctx->sq_entries)) { - /* double index for 128-byte SQEs, twice as long */ - if (ctx->flags & IORING_SETUP_SQE128) - head <<= 1; - return &ctx->sq_sqes[head]; - } - /* drop invalid entries */ - ctx->cq_extra--; - WRITE_ONCE(ctx->rings->sq_dropped, - READ_ONCE(ctx->rings->sq_dropped) + 1); - return NULL; + /* double index for 128-byte SQEs, twice as long */ + if (ctx->flags & IORING_SETUP_SQE128) + head <<= 1; + *sqe = &ctx->sq_sqes[head]; + return true; } int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) @@ -2402,7 +2417,7 @@ int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) if (unlikely(!entries)) return 0; /* make sure SQ entry isn't read before tail */ - ret = left = min3(nr, ctx->sq_entries, entries); + ret = left = min(nr, entries); io_get_task_refs(left); io_submit_state_start(&ctx->submit_state, left); @@ -2410,11 +2425,9 @@ int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) const struct io_uring_sqe *sqe; struct io_kiocb *req; - if (unlikely(!io_alloc_req_refill(ctx))) + if (unlikely(!io_alloc_req(ctx, &req))) break; - req = io_alloc_req(ctx); - sqe = io_get_sqe(ctx); - if (unlikely(!sqe)) { + if (unlikely(!io_get_sqe(ctx, &sqe))) { io_req_add_to_cache(req, ctx); break; } @@ -2449,13 +2462,13 @@ struct io_wait_queue { struct io_ring_ctx *ctx; unsigned cq_tail; unsigned nr_timeouts; + ktime_t timeout; }; static inline bool io_has_work(struct io_ring_ctx *ctx) { return test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq) || - ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && - !llist_empty(&ctx->work_llist)); + !llist_empty(&ctx->work_llist); } static inline bool io_should_wake(struct io_wait_queue *iowq) @@ -2474,59 +2487,72 @@ static inline bool io_should_wake(struct io_wait_queue *iowq) static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, int wake_flags, void *key) { - struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue, - wq); - struct io_ring_ctx *ctx = iowq->ctx; + struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue, wq); /* * Cannot safely flush overflowed CQEs from here, ensure we wake up * the task, and the next invocation will do it. 
*/ - if (io_should_wake(iowq) || io_has_work(ctx)) + if (io_should_wake(iowq) || io_has_work(iowq->ctx)) return autoremove_wake_function(curr, mode, wake_flags, key); return -1; } int io_run_task_work_sig(struct io_ring_ctx *ctx) { - if (io_run_task_work_ctx(ctx) > 0) - return 1; + if (!llist_empty(&ctx->work_llist)) { + __set_current_state(TASK_RUNNING); + if (io_run_local_work(ctx) > 0) + return 0; + } + if (io_run_task_work() > 0) + return 0; if (task_sigpending(current)) return -EINTR; return 0; } +static bool current_pending_io(void) +{ + struct io_uring_task *tctx = current->io_uring; + + if (!tctx) + return false; + return percpu_counter_read_positive(&tctx->inflight); +} + /* when returns >0, the caller should retry */ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, - struct io_wait_queue *iowq, - ktime_t *timeout) + struct io_wait_queue *iowq) { - int ret; - unsigned long check_cq; + int io_wait, ret; - /* make sure we run task_work before checking for signals */ - ret = io_run_task_work_sig(ctx); - if (ret || io_should_wake(iowq)) - return ret; - - check_cq = READ_ONCE(ctx->check_cq); - if (unlikely(check_cq)) { - /* let the caller flush overflows, retry */ - if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) - return 1; - if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)) - return -EBADR; - } - if (!schedule_hrtimeout(timeout, HRTIMER_MODE_ABS)) - return -ETIME; + if (unlikely(READ_ONCE(ctx->check_cq))) + return 1; + if (unlikely(!llist_empty(&ctx->work_llist))) + return 1; + if (unlikely(test_thread_flag(TIF_NOTIFY_SIGNAL))) + return 1; + if (unlikely(task_sigpending(current))) + return -EINTR; + if (unlikely(io_should_wake(iowq))) + return 0; /* - * Run task_work after scheduling. If we got woken because of - * task_work being processed, run it now rather than let the caller - * do another wait loop. + * Mark us as being in io_wait if we have pending requests, so cpufreq + * can take into account that the task is waiting for IO - turns out + * to be important for low QD IO. */ - ret = io_run_task_work_sig(ctx); - return ret < 0 ? 
ret : 1; + io_wait = current->in_iowait; + if (current_pending_io()) + current->in_iowait = 1; + ret = 0; + if (iowq->timeout == KTIME_MAX) + schedule(); + else if (!schedule_hrtimeout(&iowq->timeout, HRTIMER_MODE_ABS)) + ret = -ETIME; + current->in_iowait = io_wait; + return ret; } /* @@ -2539,23 +2565,17 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, { struct io_wait_queue iowq; struct io_rings *rings = ctx->rings; - ktime_t timeout = KTIME_MAX; int ret; if (!io_allowed_run_tw(ctx)) return -EEXIST; - - do { - /* always run at least 1 task work to process local work */ - ret = io_run_task_work_ctx(ctx); - if (ret < 0) - return ret; - io_cqring_overflow_flush(ctx); - - /* if user messes with these they will just get an early return */ - if (__io_cqring_events_user(ctx) >= min_events) - return 0; - } while (ret > 0); + if (!llist_empty(&ctx->work_llist)) + io_run_local_work(ctx); + io_run_task_work(); + io_cqring_overflow_flush(ctx); + /* if user messes with these they will just get an early return */ + if (__io_cqring_events_user(ctx) >= min_events) + return 0; if (sig) { #ifdef CONFIG_COMPAT @@ -2570,36 +2590,71 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, return ret; } - if (uts) { - struct timespec64 ts; - - if (get_timespec64(&ts, uts)) - return -EFAULT; - timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns()); - } - init_waitqueue_func_entry(&iowq.wq, io_wake_function); iowq.wq.private = current; INIT_LIST_HEAD(&iowq.wq.entry); iowq.ctx = ctx; iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events; + iowq.timeout = KTIME_MAX; + + if (uts) { + struct timespec64 ts; + + if (get_timespec64(&ts, uts)) + return -EFAULT; + iowq.timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns()); + } trace_io_uring_cqring_wait(ctx, min_events); do { - if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) { - finish_wait(&ctx->cq_wait, &iowq.wq); - io_cqring_do_overflow_flush(ctx); - } - prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq, - TASK_INTERRUPTIBLE); - ret = io_cqring_wait_schedule(ctx, &iowq, &timeout); - if (__io_cqring_events_user(ctx) >= min_events) - break; - cond_resched(); - } while (ret > 0); + unsigned long check_cq; - finish_wait(&ctx->cq_wait, &iowq.wq); + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { + int nr_wait = (int) iowq.cq_tail - READ_ONCE(ctx->rings->cq.tail); + + atomic_set(&ctx->cq_wait_nr, nr_wait); + set_current_state(TASK_INTERRUPTIBLE); + } else { + prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq, + TASK_INTERRUPTIBLE); + } + + ret = io_cqring_wait_schedule(ctx, &iowq); + __set_current_state(TASK_RUNNING); + atomic_set(&ctx->cq_wait_nr, 0); + + if (ret < 0) + break; + /* + * Run task_work after scheduling and before io_should_wake(). + * If we got woken because of task_work being processed, run it + * now rather than let the caller do another wait loop. 
+	 */
+		io_run_task_work();
+		if (!llist_empty(&ctx->work_llist))
+			io_run_local_work(ctx);
+
+		check_cq = READ_ONCE(ctx->check_cq);
+		if (unlikely(check_cq)) {
+			/* let the caller flush overflows, retry */
+			if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
+				io_cqring_do_overflow_flush(ctx);
+			if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)) {
+				ret = -EBADR;
+				break;
+			}
+		}
+
+		if (io_should_wake(&iowq)) {
+			ret = 0;
+			break;
+		}
+		cond_resched();
+	} while (1);
+
+	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
+		finish_wait(&ctx->cq_wait, &iowq.wq);
 	restore_saved_sigmask_unless(ret == -EINTR);
 
 	return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
@@ -2607,21 +2662,122 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 
 static void io_mem_free(void *ptr)
 {
-	struct page *page;
-
 	if (!ptr)
 		return;
 
-	page = virt_to_head_page(ptr);
-	if (put_page_testzero(page))
-		free_compound_page(page);
+	folio_put(virt_to_folio(ptr));
+}
+
+static void io_pages_free(struct page ***pages, int npages)
+{
+	struct page **page_array;
+	int i;
+
+	if (!pages)
+		return;
+
+	page_array = *pages;
+	if (!page_array)
+		return;
+
+	for (i = 0; i < npages; i++)
+		unpin_user_page(page_array[i]);
+	kvfree(page_array);
+	*pages = NULL;
+}
+
+static void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
+			    unsigned long uaddr, size_t size)
+{
+	struct page **page_array;
+	unsigned int nr_pages;
+	int ret, i;
+
+	*npages = 0;
+
+	if (uaddr & (PAGE_SIZE - 1) || !size)
+		return ERR_PTR(-EINVAL);
+
+	nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	if (nr_pages > USHRT_MAX)
+		return ERR_PTR(-EINVAL);
+	page_array = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
+	if (!page_array)
+		return ERR_PTR(-ENOMEM);
+
+	ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
+				  page_array);
+	if (ret != nr_pages) {
+err:
+		io_pages_free(&page_array, ret > 0 ? ret : 0);
+		return ret < 0 ? ERR_PTR(ret) : ERR_PTR(-EFAULT);
+	}
+	/*
+	 * Should be a single page. If the ring is small enough that we can
+	 * use a normal page, that is fine. If we need multiple pages, then
+	 * userspace should use a huge page. That's the only way to guarantee
+	 * that we get contiguous memory, outside of just being lucky or
+	 * (currently) having low memory fragmentation.
+	 */
+	if (page_array[0] != page_array[ret - 1])
+		goto err;
+
+	/*
+	 * Can't support mapping user allocated ring memory on 32-bit archs
+	 * where it could potentially reside in highmem. Just fail those with
+	 * -EINVAL, just like we did on kernels that didn't support this
+	 * feature.
+ */ + for (i = 0; i < nr_pages; i++) { + if (PageHighMem(page_array[i])) { + ret = -EINVAL; + goto err; + } + } + + *pages = page_array; + *npages = nr_pages; + return page_to_virt(page_array[0]); +} + +static void *io_rings_map(struct io_ring_ctx *ctx, unsigned long uaddr, + size_t size) +{ + return __io_uaddr_map(&ctx->ring_pages, &ctx->n_ring_pages, uaddr, + size); +} + +static void *io_sqes_map(struct io_ring_ctx *ctx, unsigned long uaddr, + size_t size) +{ + return __io_uaddr_map(&ctx->sqe_pages, &ctx->n_sqe_pages, uaddr, + size); +} + +static void io_rings_free(struct io_ring_ctx *ctx) +{ + if (!(ctx->flags & IORING_SETUP_NO_MMAP)) { + io_mem_free(ctx->rings); + io_mem_free(ctx->sq_sqes); + ctx->rings = NULL; + ctx->sq_sqes = NULL; + } else { + io_pages_free(&ctx->ring_pages, ctx->n_ring_pages); + ctx->n_ring_pages = 0; + io_pages_free(&ctx->sqe_pages, ctx->n_sqe_pages); + ctx->n_sqe_pages = 0; + } } static void *io_mem_alloc(size_t size) { gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP; + void *ret; - return (void *) __get_free_pages(gfp, get_order(size)); + ret = (void *) __get_free_pages(gfp, get_order(size)); + if (ret) + return ret; + return ERR_PTR(-ENOMEM); } static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries, @@ -2644,6 +2800,12 @@ static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries return SIZE_MAX; #endif + if (ctx->flags & IORING_SETUP_NO_SQARRAY) { + if (sq_offset) + *sq_offset = SIZE_MAX; + return off; + } + if (sq_offset) *sq_offset = off; @@ -2714,14 +2876,14 @@ static int io_eventfd_unregister(struct io_ring_ctx *ctx) static void io_req_caches_free(struct io_ring_ctx *ctx) { + struct io_kiocb *req; int nr = 0; mutex_lock(&ctx->uring_lock); io_flush_cached_locked_reqs(ctx, &ctx->submit_state); while (!io_req_cache_empty(ctx)) { - struct io_kiocb *req = io_alloc_req(ctx); - + req = io_extract_req(ctx); kmem_cache_free(req_cachep, req); nr++; } @@ -2730,13 +2892,17 @@ static void io_req_caches_free(struct io_ring_ctx *ctx) mutex_unlock(&ctx->uring_lock); } +static void io_rsrc_node_cache_free(struct io_cache_entry *entry) +{ + kfree(container_of(entry, struct io_rsrc_node, cache)); +} + static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) { io_sq_thread_finish(ctx); - io_rsrc_refs_drop(ctx); /* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */ - io_wait_rsrc_data(ctx->buf_data); - io_wait_rsrc_data(ctx->file_data); + if (WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list))) + return; mutex_lock(&ctx->uring_lock); if (ctx->buf_data) @@ -2756,14 +2922,9 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) /* there are no registered resources left, nobody uses it */ if (ctx->rsrc_node) - io_rsrc_node_destroy(ctx->rsrc_node); - if (ctx->rsrc_backup_node) - io_rsrc_node_destroy(ctx->rsrc_backup_node); - flush_delayed_work(&ctx->rsrc_put_work); - flush_delayed_work(&ctx->fallback_work); + io_rsrc_node_destroy(ctx, ctx->rsrc_node); WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list)); - WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist)); #if defined(CONFIG_UNIX) if (ctx->ring_sock) { @@ -2773,12 +2934,12 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) #endif WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list)); + io_alloc_cache_free(&ctx->rsrc_node_cache, io_rsrc_node_cache_free); if (ctx->mm_account) { mmdrop(ctx->mm_account); ctx->mm_account = NULL; } - io_mem_free(ctx->rings); - io_mem_free(ctx->sq_sqes); + io_rings_free(ctx); percpu_ref_exit(&ctx->refs); 
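From the application side, the user-allocated memory path above might be exercised as in the following sketch (hypothetical helper, error handling elided; assumes an entry count small enough that the rings and the SQE array each fit in a single page, per the contiguity rule in __io_uaddr_map(); larger rings would need MAP_HUGETLB memory):

#include <linux/io_uring.h>
#include <stdint.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

static int setup_ring_no_mmap(unsigned entries)
{
	struct io_uring_params p;
	/* one page for the SQ/CQ rings, one for the SQE array */
	void *rings = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	void *sqes = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	memset(&p, 0, sizeof(p));
	p.flags = IORING_SETUP_NO_MMAP;
	/* the kernel pins these pages instead of allocating its own */
	p.cq_off.user_addr = (uint64_t)(uintptr_t)rings;
	p.sq_off.user_addr = (uint64_t)(uintptr_t)sqes;
	return (int)syscall(__NR_io_uring_setup, entries, &p);
}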
free_uid(ctx->user); @@ -2787,18 +2948,59 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) io_wq_put_hash(ctx->hash_map); kfree(ctx->cancel_table.hbs); kfree(ctx->cancel_table_locked.hbs); - kfree(ctx->dummy_ubuf); kfree(ctx->io_bl); xa_destroy(&ctx->io_bl_xa); kfree(ctx); } +static __cold void io_activate_pollwq_cb(struct callback_head *cb) +{ + struct io_ring_ctx *ctx = container_of(cb, struct io_ring_ctx, + poll_wq_task_work); + + mutex_lock(&ctx->uring_lock); + ctx->poll_activated = true; + mutex_unlock(&ctx->uring_lock); + + /* + * Wake ups for some events between start of polling and activation + * might've been lost due to loose synchronisation. + */ + wake_up_all(&ctx->poll_wq); + percpu_ref_put(&ctx->refs); +} + +static __cold void io_activate_pollwq(struct io_ring_ctx *ctx) +{ + spin_lock(&ctx->completion_lock); + /* already activated or in progress */ + if (ctx->poll_activated || ctx->poll_wq_task_work.func) + goto out; + if (WARN_ON_ONCE(!ctx->task_complete)) + goto out; + if (!ctx->submitter_task) + goto out; + /* + * with ->submitter_task only the submitter task completes requests, we + * only need to sync with it, which is done by injecting a tw + */ + init_task_work(&ctx->poll_wq_task_work, io_activate_pollwq_cb); + percpu_ref_get(&ctx->refs); + if (task_work_add(ctx->submitter_task, &ctx->poll_wq_task_work, TWA_SIGNAL)) + percpu_ref_put(&ctx->refs); +out: + spin_unlock(&ctx->completion_lock); +} + static __poll_t io_uring_poll(struct file *file, poll_table *wait) { struct io_ring_ctx *ctx = file->private_data; __poll_t mask = 0; - poll_wait(file, &ctx->cq_wait, wait); + if (unlikely(!ctx->poll_activated)) + io_activate_pollwq(ctx); + + poll_wait(file, &ctx->poll_wq, wait); /* * synchronizes with barrier from wq_has_sleeper call in * io_commit_cqring @@ -2853,12 +3055,12 @@ static __cold void io_tctx_exit_cb(struct callback_head *cb) work = container_of(cb, struct io_tctx_exit, task_work); /* - * When @in_idle, we're in cancellation and it's racy to remove the + * When @in_cancel, we're in cancellation and it's racy to remove the * node. It'll be removed by the end of cancellation, just ignore it. * tctx can be NULL if the queueing of this task_work raced with * work cancelation off the exec path. 
*/ - if (tctx && !atomic_read(&tctx->in_idle)) + if (tctx && !atomic_read(&tctx->in_cancel)) io_uring_del_tctx_node((unsigned long)work->ctx); complete(&work->completion); } @@ -2963,6 +3165,10 @@ static __cold void io_ring_exit_work(struct work_struct *work) spin_lock(&ctx->completion_lock); spin_unlock(&ctx->completion_lock); + /* pairs with RCU read section in io_req_local_work_add() */ + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) + synchronize_rcu(); + io_ring_ctx_free(ctx); } @@ -2986,6 +3192,8 @@ static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) if (ctx->rings) io_kill_timeouts(ctx, NULL, true); + flush_delayed_work(&ctx->fallback_work); + INIT_WORK(&ctx->exit_work, io_ring_exit_work); /* * Use system_unbound_wq to avoid spawning tons of event kworkers @@ -3078,6 +3286,12 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, enum io_wq_cancel cret; bool ret = false; + /* set it so io_req_local_work_add() would wake us up */ + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { + atomic_set(&ctx->cq_wait_nr, 1); + smp_mb(); + } + /* failed during ring init, it couldn't have issued any requests */ if (!ctx->rings) return false; @@ -3104,7 +3318,8 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, } } - if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) + if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && + io_allowed_defer_tw_run(ctx)) ret |= io_run_local_work(ctx) > 0; ret |= io_cancel_defer_files(ctx, task, cancel_all); mutex_lock(&ctx->uring_lock); @@ -3131,6 +3346,8 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) { struct io_uring_task *tctx = current->io_uring; struct io_ring_ctx *ctx; + struct io_tctx_node *node; + unsigned long index; s64 inflight; DEFINE_WAIT(wait); @@ -3141,7 +3358,7 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) if (tctx->io_wq) io_wq_exit_start(tctx->io_wq); - atomic_inc(&tctx->in_idle); + atomic_inc(&tctx->in_cancel); do { bool loop = false; @@ -3152,9 +3369,6 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) break; if (!sqd) { - struct io_tctx_node *node; - unsigned long index; - xa_for_each(&tctx->xa, index, node) { /* sqpoll task will cancel all its requests */ if (node->ctx->sq_data) @@ -3177,7 +3391,13 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE); io_run_task_work(); io_uring_drop_tctx_refs(current); - + xa_for_each(&tctx->xa, index, node) { + if (!llist_empty(&node->ctx->work_llist)) { + WARN_ON_ONCE(node->ctx->submitter_task && + node->ctx->submitter_task != current); + goto end_wait; + } + } /* * If we've seen completions, retry without waiting. This * avoids a race where a completion comes in before we did @@ -3185,6 +3405,7 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) */ if (inflight == tctx_inflight(tctx, !cancel_all)) schedule(); +end_wait: finish_wait(&tctx->wait, &wait); } while (1); @@ -3192,9 +3413,9 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) if (cancel_all) { /* * We shouldn't run task_works after cancel, so just leave - * ->in_idle set for normal exit. + * ->in_cancel set for normal exit. 
 */
-	atomic_dec(&tctx->in_idle);
+	atomic_dec(&tctx->in_cancel);
 	/* for exec all current's requests should be gone, kill tctx */
 	__io_uring_free(current);
 }
 
@@ -3213,7 +3434,11 @@ static void *io_uring_validate_mmap_request(struct file *file,
 	struct page *page;
 	void *ptr;
 
-	switch (offset) {
+	/* Don't allow mmap if the ring was setup without it */
+	if (ctx->flags & IORING_SETUP_NO_MMAP)
+		return ERR_PTR(-EINVAL);
+
+	switch (offset & IORING_OFF_MMAP_MASK) {
 	case IORING_OFF_SQ_RING:
 	case IORING_OFF_CQ_RING:
 		ptr = ctx->rings;
@@ -3221,6 +3446,17 @@ static void *io_uring_validate_mmap_request(struct file *file,
 	case IORING_OFF_SQES:
 		ptr = ctx->sq_sqes;
 		break;
+	case IORING_OFF_PBUF_RING: {
+		unsigned int bgid;
+
+		bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
+		mutex_lock(&ctx->uring_lock);
+		ptr = io_pbuf_get_address(ctx, bgid);
+		mutex_unlock(&ctx->uring_lock);
+		if (!ptr)
+			return ERR_PTR(-EINVAL);
+		break;
+	}
 	default:
 		return ERR_PTR(-EINVAL);
 	}
@@ -3248,11 +3484,54 @@ static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
 
 	return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
 }
 
+static unsigned long io_uring_mmu_get_unmapped_area(struct file *filp,
+			unsigned long addr, unsigned long len,
+			unsigned long pgoff, unsigned long flags)
+{
+	void *ptr;
+
+	/*
+	 * Do not allow mapping to a user-provided address, to avoid breaking
+	 * the aliasing rules. Userspace is not able to guess the offset
+	 * address of kernel kmalloc()ed memory areas.
+	 */
+	if (addr)
+		return -EINVAL;
+
+	ptr = io_uring_validate_mmap_request(filp, pgoff, len);
+	if (IS_ERR(ptr))
+		return -ENOMEM;
+
+	/*
+	 * Some architectures have strong cache aliasing requirements.
+	 * For such architectures we need a coherent mapping which aliases
+	 * kernel memory *and* userspace memory. To achieve that:
+	 * - use a NULL file pointer to reference physical memory, and
+	 * - use the kernel virtual address of the shared io_uring context
+	 *   (instead of the userspace-provided address, which has to be 0UL
+	 *   anyway).
+	 * - use the same pgoff which get_unmapped_area() uses to
+	 *   calculate the page colouring.
+	 * For architectures without such aliasing requirements, the
+	 * architecture will return any suitable mapping because addr is 0.
+	 */
+	filp = NULL;
+	flags |= MAP_SHARED;
+	pgoff = 0;	/* has been translated to ptr above */
+#ifdef SHM_COLOUR
+	addr = (uintptr_t) ptr;
+	pgoff = addr >> PAGE_SHIFT;
+#else
+	addr = 0UL;
+#endif
+	return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
+}
+
 #else /* !CONFIG_MMU */
 
 static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
 {
-	return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -EINVAL;
+	return is_nommu_shared_mapping(vma->vm_flags) ? 
0 : -EINVAL; } static unsigned int io_uring_nommu_mmap_capabilities(struct file *file) @@ -3376,11 +3655,9 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, } if (flags & IORING_ENTER_SQ_WAKEUP) wake_up(&ctx->sq_data->wait); - if (flags & IORING_ENTER_SQ_WAIT) { - ret = io_sqpoll_wait_sq(ctx); - if (ret) - goto out; - } + if (flags & IORING_ENTER_SQ_WAIT) + io_sqpoll_wait_sq(ctx); + ret = to_submit; } else if (to_submit) { ret = io_uring_add_tctx_node(ctx); @@ -3462,6 +3739,8 @@ static const struct file_operations io_uring_fops = { #ifndef CONFIG_MMU .get_unmapped_area = io_uring_nommu_get_unmapped_area, .mmap_capabilities = io_uring_nommu_mmap_capabilities, +#else + .get_unmapped_area = io_uring_mmu_get_unmapped_area, #endif .poll = io_uring_poll, #ifdef CONFIG_PROC_FS @@ -3479,6 +3758,7 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, { struct io_rings *rings; size_t size, sq_array_offset; + void *ptr; /* make sure these are sane, as we already accounted them */ ctx->sq_entries = p->sq_entries; @@ -3488,12 +3768,17 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, if (size == SIZE_MAX) return -EOVERFLOW; - rings = io_mem_alloc(size); - if (!rings) - return -ENOMEM; + if (!(ctx->flags & IORING_SETUP_NO_MMAP)) + rings = io_mem_alloc(size); + else + rings = io_rings_map(ctx, p->cq_off.user_addr, size); + + if (IS_ERR(rings)) + return PTR_ERR(rings); ctx->rings = rings; - ctx->sq_array = (u32 *)((char *)rings + sq_array_offset); + if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) + ctx->sq_array = (u32 *)((char *)rings + sq_array_offset); rings->sq_ring_mask = p->sq_entries - 1; rings->cq_ring_mask = p->cq_entries - 1; rings->sq_ring_entries = p->sq_entries; @@ -3504,34 +3789,31 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, else size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); if (size == SIZE_MAX) { - io_mem_free(ctx->rings); - ctx->rings = NULL; + io_rings_free(ctx); return -EOVERFLOW; } - ctx->sq_sqes = io_mem_alloc(size); - if (!ctx->sq_sqes) { - io_mem_free(ctx->rings); - ctx->rings = NULL; - return -ENOMEM; + if (!(ctx->flags & IORING_SETUP_NO_MMAP)) + ptr = io_mem_alloc(size); + else + ptr = io_sqes_map(ctx, p->sq_off.user_addr, size); + + if (IS_ERR(ptr)) { + io_rings_free(ctx); + return PTR_ERR(ptr); } + ctx->sq_sqes = ptr; return 0; } -static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file) +static int io_uring_install_fd(struct file *file) { - int ret, fd; + int fd; fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC); if (fd < 0) return fd; - - ret = __io_uring_add_tctx_node(ctx); - if (ret) { - put_unused_fd(fd); - return ret; - } fd_install(fd, file); return fd; } @@ -3572,6 +3854,7 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, struct io_uring_params __user *params) { struct io_ring_ctx *ctx; + struct io_uring_task *tctx; struct file *file; int ret; @@ -3583,6 +3866,10 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, entries = IORING_MAX_ENTRIES; } + if ((p->flags & IORING_SETUP_REGISTERED_FD_ONLY) + && !(p->flags & IORING_SETUP_NO_MMAP)) + return -EINVAL; + /* * Use twice as many entries for the CQ ring. 
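io_uring_validate_mmap_request() above decodes the new IORING_OFF_PBUF_RING offsets. A minimal userspace sketch of the matching mmap() call for a kernel-allocated buffer ring (IOU_PBUF_RING_MMAP, handled further down in kbuf.c) for group bgid; raw mmap() without liburing, and the caller is assumed to know the entry count it registered:

#include <sys/mman.h>
#include <linux/io_uring.h>

static struct io_uring_buf_ring *map_pbuf_ring(int ring_fd, unsigned int bgid,
					       unsigned int entries)
{
	/* offset layout decoded by io_uring_validate_mmap_request() */
	__u64 off = IORING_OFF_PBUF_RING |
		    ((__u64)bgid << IORING_OFF_PBUF_SHIFT);
	size_t len = entries * sizeof(struct io_uring_buf);
	void *ptr;

	ptr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
		   ring_fd, off);
	return ptr == MAP_FAILED ? NULL : (struct io_uring_buf_ring *)ptr;
}
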
It's possible for the * application to drive a higher depth than the size of the SQ ring, @@ -3621,6 +3908,16 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, !(ctx->flags & IORING_SETUP_SQPOLL)) ctx->task_complete = true; + if (ctx->task_complete || (ctx->flags & IORING_SETUP_IOPOLL)) + ctx->lockless_cq = true; + + /* + * lazy poll_wq activation relies on ->task_complete for synchronisation + * purposes, see io_activate_pollwq() + */ + if (!ctx->task_complete) + ctx->poll_activated = true; + /* * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user * space applications don't need to do io completion events @@ -3682,22 +3979,23 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, ret = io_sq_offload_create(ctx, p); if (ret) goto err; - /* always set a rsrc node */ - ret = io_rsrc_node_switch_start(ctx); + + ret = io_rsrc_init(ctx); if (ret) goto err; - io_rsrc_node_switch(ctx, NULL); - memset(&p->sq_off, 0, sizeof(p->sq_off)); p->sq_off.head = offsetof(struct io_rings, sq.head); p->sq_off.tail = offsetof(struct io_rings, sq.tail); p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask); p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries); p->sq_off.flags = offsetof(struct io_rings, sq_flags); p->sq_off.dropped = offsetof(struct io_rings, sq_dropped); - p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings; + if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) + p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings; + p->sq_off.resv1 = 0; + if (!(ctx->flags & IORING_SETUP_NO_MMAP)) + p->sq_off.user_addr = 0; - memset(&p->cq_off, 0, sizeof(p->cq_off)); p->cq_off.head = offsetof(struct io_rings, cq.head); p->cq_off.tail = offsetof(struct io_rings, cq.tail); p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask); @@ -3705,6 +4003,9 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, p->cq_off.overflow = offsetof(struct io_rings, cq_overflow); p->cq_off.cqes = offsetof(struct io_rings, cqes); p->cq_off.flags = offsetof(struct io_rings, cq_flags); + p->cq_off.resv1 = 0; + if (!(ctx->flags & IORING_SETUP_NO_MMAP)) + p->cq_off.user_addr = 0; p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | @@ -3712,7 +4013,7 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED | IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS | IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP | - IORING_FEAT_LINKED_FILE; + IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING; if (copy_to_user(params, p, sizeof(*p))) { ret = -EFAULT; @@ -3729,22 +4030,30 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, goto err; } + ret = __io_uring_add_tctx_node(ctx); + if (ret) + goto err_fput; + tctx = current->io_uring; + /* * Install ring fd as the very last thing, so we don't risk someone * having closed it before we finish setup */ - ret = io_uring_install_fd(ctx, file); - if (ret < 0) { - /* fput will clean it up */ - fput(file); - return ret; - } + if (p->flags & IORING_SETUP_REGISTERED_FD_ONLY) + ret = io_ring_add_registered_file(tctx, file, 0, IO_RINGFD_REG_MAX); + else + ret = io_uring_install_fd(file); + if (ret < 0) + goto err_fput; trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); return ret; err: io_ring_ctx_wait_and_kill(ctx); return ret; +err_fput: + fput(file); + return ret; } /* @@ -3770,7 +4079,9 @@ 
static long io_uring_setup(u32 entries, struct io_uring_params __user *params) IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL | IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG | IORING_SETUP_SQE128 | IORING_SETUP_CQE32 | - IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN)) + IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN | + IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY | + IORING_SETUP_NO_SQARRAY)) return -EINVAL; return io_uring_create(entries, &p, params); @@ -3779,17 +4090,25 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) static inline bool io_uring_allowed(void) { int disabled = READ_ONCE(sysctl_io_uring_disabled); + kgid_t io_uring_group; static bool printed = false; - if (disabled == 0 || (disabled == 1 && capable(CAP_SYS_ADMIN))) { - if (!printed) { - mark_tech_preview("io_uring", NULL); - printed = true; - } - return true; + if (!printed) { + mark_tech_preview("io_uring", NULL); + printed = true; } - return false; + if (disabled == 2) + return false; + + if (disabled == 0 || capable(CAP_SYS_ADMIN)) + return true; + + io_uring_group = make_kgid(&init_user_ns, sysctl_io_uring_group); + if (!gid_valid(io_uring_group)) + return false; + + return in_group_p(io_uring_group); } SYSCALL_DEFINE2(io_uring_setup, u32, entries, @@ -3828,7 +4147,7 @@ static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg, for (i = 0; i < nr_args; i++) { p->ops[i].op = i; - if (!io_op_defs[i].not_supported) + if (!io_issue_defs[i].not_supported) p->ops[i].flags = IO_URING_OP_SUPPORTED; } p->ops_len = i; @@ -3933,8 +4252,15 @@ static int io_register_enable_rings(struct io_ring_ctx *ctx) if (!(ctx->flags & IORING_SETUP_R_DISABLED)) return -EBADFD; - if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) + if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) { WRITE_ONCE(ctx->submitter_task, get_task_struct(current)); + /* + * Lazy activation attempts would fail if it was polled before + * submitter_task is set. 
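IORING_SETUP_NO_SQARRAY, accepted in the flag mask above, drops the indirection array so the kernel consumes SQEs in ring order. A sketch of the simplified submission side, with a hypothetical struct sq standing in for the already-mmap()ed ring pointers:

#include <linux/io_uring.h>

struct sq {
	unsigned int *tail;		/* mapped SQ ring tail */
	unsigned int *ring_mask;	/* mapped SQ ring mask */
	struct io_uring_sqe *sqes;	/* mapped SQE array */
};

static void push_sqe(struct sq *sq, const struct io_uring_sqe *src)
{
	unsigned int tail = *sq->tail;

	/* no sq_array indirection: the slot is simply tail & mask */
	sq->sqes[tail & *sq->ring_mask] = *src;
	/* publish the SQE before the kernel can observe the new tail */
	__atomic_store_n(sq->tail, tail + 1, __ATOMIC_RELEASE);
}
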
+ */ + if (wq_has_sleeper(&ctx->poll_wq)) + io_activate_pollwq(ctx); + } if (ctx->restrictions.registered) ctx->restricted = 1; @@ -3945,16 +4271,28 @@ static int io_register_enable_rings(struct io_ring_ctx *ctx) return 0; } +static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx, + cpumask_var_t new_mask) +{ + int ret; + + if (!(ctx->flags & IORING_SETUP_SQPOLL)) { + ret = io_wq_cpu_affinity(current->io_uring, new_mask); + } else { + mutex_unlock(&ctx->uring_lock); + ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask); + mutex_lock(&ctx->uring_lock); + } + + return ret; +} + static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx, void __user *arg, unsigned len) { - struct io_uring_task *tctx = current->io_uring; cpumask_var_t new_mask; int ret; - if (!tctx || !tctx->io_wq) - return -EINVAL; - if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) return -ENOMEM; @@ -3975,19 +4313,14 @@ static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx, return -EFAULT; } - ret = io_wq_cpu_affinity(tctx->io_wq, new_mask); + ret = __io_register_iowq_aff(ctx, new_mask); free_cpumask_var(new_mask); return ret; } static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx) { - struct io_uring_task *tctx = current->io_uring; - - if (!tctx || !tctx->io_wq) - return -EINVAL; - - return io_wq_cpu_affinity(tctx->io_wq, NULL); + return __io_register_iowq_aff(ctx, NULL); } static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, @@ -4245,17 +4578,36 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, struct io_ring_ctx *ctx; long ret = -EBADF; struct fd f; + bool use_registered_ring; + + use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING); + opcode &= ~IORING_REGISTER_USE_REGISTERED_RING; if (opcode >= IORING_REGISTER_LAST) return -EINVAL; - f = fdget(fd); - if (!f.file) - return -EBADF; + if (use_registered_ring) { + /* + * Ring fd has been registered via IORING_REGISTER_RING_FDS, we + * need only dereference our task private array to find it. + */ + struct io_uring_task *tctx = current->io_uring; - ret = -EOPNOTSUPP; - if (!io_is_uring_fops(f.file)) - goto out_fput; + if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX)) + return -EINVAL; + fd = array_index_nospec(fd, IO_RINGFD_REG_MAX); + f.file = tctx->registered_rings[fd]; + f.flags = 0; + if (unlikely(!f.file)) + return -EBADF; + } else { + f = fdget(fd); + if (unlikely(!f.file)) + return -EBADF; + ret = -EOPNOTSUPP; + if (!io_is_uring_fops(f.file)) + goto out_fput; + } ctx = f.file->private_data; @@ -4344,8 +4696,20 @@ static int __init io_uring_init(void) io_uring_optable_init(); - req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC | - SLAB_ACCOUNT); + /* + * Allow user copy in the per-command field, which starts after the + * file in io_kiocb and until the opcode field. The openat2 handling + * requires copying in user memory into the io_kiocb object in that + * range, and HARDENED_USERCOPY will complain if we haven't + * correctly annotated this range. 
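io_uring_register() above now honours IORING_REGISTER_USE_REGISTERED_RING, in which case fd is an index into the task's registered-ring table rather than a normal descriptor. A short sketch; buffer registration is picked only as an example opcode:

#include <unistd.h>
#include <sys/syscall.h>
#include <sys/uio.h>
#include <linux/io_uring.h>

static int register_buffers_via_ring_index(unsigned int ring_index,
					   const struct iovec *iovs,
					   unsigned int nr)
{
	/*
	 * ring_index comes from IORING_REGISTER_RING_FDS, or is the value
	 * returned by io_uring_setup() when IORING_SETUP_REGISTERED_FD_ONLY
	 * was used.
	 */
	return syscall(__NR_io_uring_register, ring_index,
		       IORING_REGISTER_BUFFERS |
		       IORING_REGISTER_USE_REGISTERED_RING,
		       iovs, nr);
}
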
+ */ + req_cachep = kmem_cache_create_usercopy("io_kiocb", + sizeof(struct io_kiocb), 0, + SLAB_HWCACHE_ALIGN | SLAB_PANIC | + SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU, + offsetof(struct io_kiocb, cmd.data), + sizeof_field(struct io_kiocb, cmd.data), NULL); + #ifdef CONFIG_SYSCTL register_sysctl_init("kernel", kernel_io_uring_disabled_table); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 87426c0c6d3e..0bc145614a6e 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include "io-wq.h" @@ -14,6 +15,17 @@ #include #endif +enum { + /* + * A hint to not wake right away but delay until there are enough of + * tw's queued to match the number of CQEs the task is waiting for. + * + * Must not be used wirh requests generating more than one CQE. + * It's also ignored unless IORING_SETUP_DEFER_TASKRUN is set. + */ + IOU_F_TWQ_LAZY_WAKE = 1, +}; + enum { IOU_OK = 0, IOU_ISSUE_SKIP_COMPLETE = -EIOCBQUEUED, @@ -26,16 +38,13 @@ enum { IOU_STOP_MULTISHOT = -ECANCELED, }; -struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow); -bool io_req_cqe_overflow(struct io_kiocb *req); +bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow); +void io_req_cqe_overflow(struct io_kiocb *req); int io_run_task_work_sig(struct io_ring_ctx *ctx); -int __io_run_local_work(struct io_ring_ctx *ctx, bool *locked); -int io_run_local_work(struct io_ring_ctx *ctx); void io_req_defer_failed(struct io_kiocb *req, s32 res); void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags); bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); -bool io_aux_cqe(struct io_ring_ctx *ctx, bool defer, u64 user_data, s32 res, u32 cflags, - bool allow_overflow); +bool io_fill_cqe_req_aux(struct io_kiocb *req, bool defer, s32 res, u32 cflags); void __io_commit_cqring_flush(struct io_ring_ctx *ctx); struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); @@ -44,28 +53,26 @@ struct file *io_file_get_normal(struct io_kiocb *req, int fd); struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned issue_flags); -static inline bool io_req_ffs_set(struct io_kiocb *req) -{ - return req->flags & REQ_F_FIXED_FILE; -} - -void __io_req_task_work_add(struct io_kiocb *req, bool allow_local); +void __io_req_task_work_add(struct io_kiocb *req, unsigned flags); bool io_is_uring_fops(struct file *file); bool io_alloc_async_data(struct io_kiocb *req); void io_req_task_queue(struct io_kiocb *req); -void io_queue_iowq(struct io_kiocb *req, bool *dont_use); -void io_req_task_complete(struct io_kiocb *req, bool *locked); +void io_queue_iowq(struct io_kiocb *req, struct io_tw_state *ts_dont_use); +void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts); void io_req_task_queue_fail(struct io_kiocb *req, int ret); -void io_req_task_submit(struct io_kiocb *req, bool *locked); +void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts); void tctx_task_work(struct callback_head *cb); __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); int io_uring_alloc_task_context(struct task_struct *task, struct io_ring_ctx *ctx); -int io_poll_issue(struct io_kiocb *req, bool *locked); +int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file, + int start, int end); + +int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts); int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr); int io_do_iopoll(struct io_ring_ctx 
*ctx, bool force_nonspin); -void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node); +void __io_submit_flush_completions(struct io_ring_ctx *ctx); int io_req_prep_async(struct io_kiocb *req); struct io_wq_work *io_wq_free_work(struct io_wq_work *work); @@ -73,61 +80,73 @@ void io_wq_submit_work(struct io_wq_work *work); void io_free_req(struct io_kiocb *req); void io_queue_next(struct io_kiocb *req); -void __io_put_task(struct task_struct *task, int nr); void io_task_refs_refill(struct io_uring_task *tctx); bool __io_alloc_req_refill(struct io_ring_ctx *ctx); bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, bool cancel_all); -#define io_lockdep_assert_cq_locked(ctx) \ - do { \ - if (ctx->flags & IORING_SETUP_IOPOLL) { \ - lockdep_assert_held(&ctx->uring_lock); \ - } else if (!ctx->task_complete) { \ - lockdep_assert_held(&ctx->completion_lock); \ - } else if (ctx->submitter_task->flags & PF_EXITING) { \ - lockdep_assert(current_work()); \ - } else { \ - lockdep_assert(current == ctx->submitter_task); \ - } \ - } while (0) +#if defined(CONFIG_PROVE_LOCKING) +static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) +{ + lockdep_assert(in_task()); + + if (ctx->flags & IORING_SETUP_IOPOLL) { + lockdep_assert_held(&ctx->uring_lock); + } else if (!ctx->task_complete) { + lockdep_assert_held(&ctx->completion_lock); + } else if (ctx->submitter_task) { + /* + * ->submitter_task may be NULL and we can still post a CQE, + * if the ring has been setup with IORING_SETUP_R_DISABLED. + * Not from an SQE, as those cannot be submitted, but via + * updating tagged resources. + */ + if (ctx->submitter_task->flags & PF_EXITING) + lockdep_assert(current_work()); + else + lockdep_assert(current == ctx->submitter_task); + } +} +#else +static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) +{ +} +#endif static inline void io_req_task_work_add(struct io_kiocb *req) { - __io_req_task_work_add(req, true); + __io_req_task_work_add(req, 0); } #define io_for_each_link(pos, head) \ for (pos = (head); pos; pos = pos->link) -void io_cq_unlock_post(struct io_ring_ctx *ctx); - -static inline struct io_uring_cqe *io_get_cqe_overflow(struct io_ring_ctx *ctx, - bool overflow) +static inline bool io_get_cqe_overflow(struct io_ring_ctx *ctx, + struct io_uring_cqe **ret, + bool overflow) { io_lockdep_assert_cq_locked(ctx); - if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) { - struct io_uring_cqe *cqe = ctx->cqe_cached; - - ctx->cached_cq_tail++; - ctx->cqe_cached++; - if (ctx->flags & IORING_SETUP_CQE32) - ctx->cqe_cached++; - return cqe; + if (unlikely(ctx->cqe_cached >= ctx->cqe_sentinel)) { + if (unlikely(!io_cqe_cache_refill(ctx, overflow))) + return false; } - - return __io_get_cqe(ctx, overflow); + *ret = ctx->cqe_cached; + ctx->cached_cq_tail++; + ctx->cqe_cached++; + if (ctx->flags & IORING_SETUP_CQE32) + ctx->cqe_cached++; + return true; } -static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) +static inline bool io_get_cqe(struct io_ring_ctx *ctx, struct io_uring_cqe **ret) { - return io_get_cqe_overflow(ctx, false); + return io_get_cqe_overflow(ctx, ret, false); } -static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx, - struct io_kiocb *req) +static __always_inline bool io_fill_cqe_req(struct io_ring_ctx *ctx, + struct io_kiocb *req) { struct io_uring_cqe *cqe; @@ -136,39 +155,22 @@ static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx, * submission (by quite a lot). 
Increment the overflow count in * the ring. */ - cqe = io_get_cqe(ctx); - if (unlikely(!cqe)) + if (unlikely(!io_get_cqe(ctx, &cqe))) return false; - trace_io_uring_complete(req->ctx, req, req->cqe.user_data, - req->cqe.res, req->cqe.flags, - (req->flags & REQ_F_CQE32_INIT) ? req->extra1 : 0, - (req->flags & REQ_F_CQE32_INIT) ? req->extra2 : 0); + if (trace_io_uring_complete_enabled()) + trace_io_uring_complete(req->ctx, req, req->cqe.user_data, + req->cqe.res, req->cqe.flags, + req->big_cqe.extra1, req->big_cqe.extra2); memcpy(cqe, &req->cqe, sizeof(*cqe)); - if (ctx->flags & IORING_SETUP_CQE32) { - u64 extra1 = 0, extra2 = 0; - - if (req->flags & REQ_F_CQE32_INIT) { - extra1 = req->extra1; - extra2 = req->extra2; - } - - WRITE_ONCE(cqe->big_cqe[0], extra1); - WRITE_ONCE(cqe->big_cqe[1], extra2); + memcpy(cqe->big_cqe, &req->big_cqe, sizeof(*cqe)); + memset(&req->big_cqe, 0, sizeof(req->big_cqe)); } return true; } -static inline bool io_fill_cqe_req(struct io_ring_ctx *ctx, - struct io_kiocb *req) -{ - if (likely(__io_fill_cqe_req(ctx, req))) - return true; - return io_req_cqe_overflow(req); -} - static inline void req_set_fail(struct io_kiocb *req) { req->flags |= REQ_F_FAIL; @@ -189,10 +191,10 @@ static inline bool req_has_async_data(struct io_kiocb *req) return req->flags & REQ_F_ASYNC_DATA; } -static inline void io_put_file(struct file *file) +static inline void io_put_file(struct io_kiocb *req) { - if (file) - fput(file); + if (!(req->flags & REQ_F_FIXED_FILE) && req->file) + fput(req->file); } static inline void io_ring_submit_unlock(struct io_ring_ctx *ctx, @@ -223,8 +225,14 @@ static inline void io_commit_cqring(struct io_ring_ctx *ctx) smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail); } -/* requires smb_mb() prior, see wq_has_sleeper() */ -static inline void __io_cqring_wake(struct io_ring_ctx *ctx) +static inline void io_poll_wq_wake(struct io_ring_ctx *ctx) +{ + if (wq_has_sleeper(&ctx->poll_wq)) + __wake_up(&ctx->poll_wq, TASK_NORMAL, 0, + poll_to_key(EPOLL_URING_WAKE | EPOLLIN)); +} + +static inline void io_cqring_wake(struct io_ring_ctx *ctx) { /* * Trigger waitqueue handler on all waiters on our waitqueue. This @@ -236,17 +244,11 @@ static inline void __io_cqring_wake(struct io_ring_ctx *ctx) * waitqueue handlers, we know we have a dependency between eventfd or * epoll and should terminate multishot poll at that point. 
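io_fill_cqe_req() above copies the extra payload into cqe->big_cqe[] when IORING_SETUP_CQE32 is in use, which doubles the size of each CQ slot. A minimal userspace sketch of draining the CQ ring with the corresponding index shift; struct cq and handle_cqe() are hypothetical stand-ins for the mapped ring pointers and the application's completion handler:

#include <linux/io_uring.h>

struct cq {
	unsigned int *head, *tail, *ring_mask;
	struct io_uring_cqe *cqes;
};

void handle_cqe(const struct io_uring_cqe *cqe);	/* application supplied */

static unsigned int drain_cq(struct cq *cq, int cqe32)
{
	unsigned int head = *cq->head, seen = 0;
	unsigned int tail = __atomic_load_n(cq->tail, __ATOMIC_ACQUIRE);
	unsigned int shift = cqe32 ? 1 : 0;	/* CQE32 doubles each slot */

	while (head != tail) {
		handle_cqe(&cq->cqes[(head & *cq->ring_mask) << shift]);
		head++;
		seen++;
	}
	__atomic_store_n(cq->head, head, __ATOMIC_RELEASE);
	return seen;
}
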
*/ - if (waitqueue_active(&ctx->cq_wait)) + if (wq_has_sleeper(&ctx->cq_wait)) __wake_up(&ctx->cq_wait, TASK_NORMAL, 0, poll_to_key(EPOLL_URING_WAKE | EPOLLIN)); } -static inline void io_cqring_wake(struct io_ring_ctx *ctx) -{ - smp_mb(); - __io_cqring_wake(ctx); -} - static inline bool io_sqring_full(struct io_ring_ctx *ctx) { struct io_rings *r = ctx->rings; @@ -257,9 +259,11 @@ static inline bool io_sqring_full(struct io_ring_ctx *ctx) static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) { struct io_rings *rings = ctx->rings; + unsigned int entries; /* make sure SQ entry isn't read before tail */ - return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head; + entries = smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head; + return min(entries, ctx->sq_entries); } static inline int io_run_task_work(void) @@ -294,47 +298,11 @@ static inline bool io_task_work_pending(struct io_ring_ctx *ctx) return task_work_pending(current) || !wq_list_empty(&ctx->work_llist); } -static inline int io_run_task_work_ctx(struct io_ring_ctx *ctx) +static inline void io_tw_lock(struct io_ring_ctx *ctx, struct io_tw_state *ts) { - int ret = 0; - int ret2; - - if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) - ret = io_run_local_work(ctx); - - /* want to run this after in case more is added */ - ret2 = io_run_task_work(); - - /* Try propagate error in favour of if tasks were run, - * but still make sure to run them if requested - */ - if (ret >= 0) - ret += ret2; - - return ret; -} - -static inline int io_run_local_work_locked(struct io_ring_ctx *ctx) -{ - bool locked; - int ret; - - if (llist_empty(&ctx->work_llist)) - return 0; - - locked = true; - ret = __io_run_local_work(ctx, &locked); - /* shouldn't happen! */ - if (WARN_ON_ONCE(!locked)) + if (!ts->locked) { mutex_lock(&ctx->uring_lock); - return ret; -} - -static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked) -{ - if (!*locked) { - mutex_lock(&ctx->uring_lock); - *locked = true; + ts->locked = true; } } @@ -355,19 +323,11 @@ static inline void io_req_complete_defer(struct io_kiocb *req) static inline void io_commit_cqring_flush(struct io_ring_ctx *ctx) { - if (unlikely(ctx->off_timeout_used || ctx->drain_active || ctx->has_evfd)) + if (unlikely(ctx->off_timeout_used || ctx->drain_active || + ctx->has_evfd || ctx->poll_activated)) __io_commit_cqring_flush(ctx); } -/* must to be called somewhat shortly after putting a request */ -static inline void io_put_task(struct task_struct *task, int nr) -{ - if (likely(task == current)) - task->io_uring->cached_refs += nr; - else - __io_put_task(task, nr); -} - static inline void io_get_task_refs(int nr) { struct io_uring_task *tctx = current->io_uring; @@ -382,19 +342,30 @@ static inline bool io_req_cache_empty(struct io_ring_ctx *ctx) return !ctx->submit_state.free_list.next; } -static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx) +extern struct kmem_cache *req_cachep; + +static inline struct io_kiocb *io_extract_req(struct io_ring_ctx *ctx) { - if (unlikely(io_req_cache_empty(ctx))) - return __io_alloc_req_refill(ctx); + struct io_kiocb *req; + + req = container_of(ctx->submit_state.free_list.next, struct io_kiocb, comp_list); + wq_stack_extract(&ctx->submit_state.free_list); + return req; +} + +static inline bool io_alloc_req(struct io_ring_ctx *ctx, struct io_kiocb **req) +{ + if (unlikely(io_req_cache_empty(ctx))) { + if (!__io_alloc_req_refill(ctx)) + return false; + } + *req = io_extract_req(ctx); return true; } -static inline struct io_kiocb *io_alloc_req(struct 
io_ring_ctx *ctx) +static inline bool io_allowed_defer_tw_run(struct io_ring_ctx *ctx) { - struct io_wq_work_node *node; - - node = wq_stack_extract(&ctx->submit_state.free_list); - return container_of(node, struct io_kiocb, comp_list); + return likely(ctx->submitter_task == current); } static inline bool io_allowed_run_tw(struct io_ring_ctx *ctx) @@ -410,4 +381,14 @@ static inline void io_req_queue_tw_complete(struct io_kiocb *req, s32 res) io_req_task_work_add(req); } +/* + * IORING_SETUP_SQE128 contexts allocate twice the normal SQE size for each + * slot. + */ +static inline size_t uring_sqe_size(struct io_ring_ctx *ctx) +{ + if (ctx->flags & IORING_SETUP_SQE128) + return 2 * sizeof(struct io_uring_sqe); + return sizeof(struct io_uring_sqe); +} #endif diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index a90c820ce99e..9123138aa9f4 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -137,7 +137,8 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, return NULL; head &= bl->mask; - if (head < IO_BUFFER_LIST_BUF_PER_PAGE) { + /* mmaped buffers are always contig */ + if (bl->is_mmap || head < IO_BUFFER_LIST_BUF_PER_PAGE) { buf = &br->bufs[head]; } else { int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1); @@ -179,7 +180,7 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len, bl = io_buffer_get_list(ctx, req->buf_index); if (likely(bl)) { - if (bl->buf_nr_pages) + if (bl->is_mapped) ret = io_ring_buffer_select(req, len, bl, issue_flags); else ret = io_provided_buffer_select(req, len, bl); @@ -214,17 +215,24 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, if (!nbufs) return 0; - if (bl->buf_nr_pages) { - int j; - + if (bl->is_mapped) { i = bl->buf_ring->tail - bl->head; - for (j = 0; j < bl->buf_nr_pages; j++) - unpin_user_page(bl->buf_pages[j]); - kvfree(bl->buf_pages); - bl->buf_pages = NULL; - bl->buf_nr_pages = 0; + if (bl->is_mmap) { + folio_put(virt_to_folio(bl->buf_ring)); + bl->buf_ring = NULL; + bl->is_mmap = 0; + } else if (bl->buf_nr_pages) { + int j; + + for (j = 0; j < bl->buf_nr_pages; j++) + unpin_user_page(bl->buf_pages[j]); + kvfree(bl->buf_pages); + bl->buf_pages = NULL; + bl->buf_nr_pages = 0; + } /* make sure it's seen as empty */ INIT_LIST_HEAD(&bl->buf_list); + bl->is_mapped = 0; return i; } @@ -304,7 +312,7 @@ int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) if (bl) { ret = -EINVAL; /* can't use provide/remove buffers command on mapped buffers */ - if (!bl->buf_nr_pages) + if (!bl->is_mapped) ret = __io_remove_buffers(ctx, bl, p->nbufs); } io_ring_submit_unlock(ctx, issue_flags); @@ -449,7 +457,7 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) } } /* can't add buffers via this command for a mapped buffer ring */ - if (bl->buf_nr_pages) { + if (bl->is_mapped) { ret = -EINVAL; goto err; } @@ -464,23 +472,98 @@ err: return IOU_OK; } -int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) +static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg, + struct io_buffer_list *bl) { struct io_uring_buf_ring *br; + struct page **pages; + int i, nr_pages; + + pages = io_pin_pages(reg->ring_addr, + flex_array_size(br, bufs, reg->ring_entries), + &nr_pages); + if (IS_ERR(pages)) + return PTR_ERR(pages); + + /* + * Apparently some 32-bit boxes (ARM) will return highmem pages, + * which then need to be mapped. We could support that, but it'd + * complicate the code and slowdown the common cases quite a bit. 
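io_ring_buffer_select() above consumes mapped buffer-ring entries at head, so userspace owns the tail. A short sketch of handing one buffer (back) to the kernel, roughly what liburing's io_uring_buf_ring_add() plus advance() amount to; mask is ring_entries - 1:

#include <stdint.h>
#include <linux/io_uring.h>

static void pbuf_ring_add(struct io_uring_buf_ring *br, unsigned int mask,
			  void *addr, unsigned int len, unsigned short bid)
{
	unsigned short tail = br->tail;
	struct io_uring_buf *buf = &br->bufs[tail & mask];

	buf->addr = (__u64)(uintptr_t)addr;
	buf->len = len;
	buf->bid = bid;
	/* make the entry visible before publishing the new tail */
	__atomic_store_n(&br->tail, tail + 1, __ATOMIC_RELEASE);
}
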
+ * So just error out, returning -EINVAL just like we did on kernels + * that didn't support mapped buffer rings. + */ + for (i = 0; i < nr_pages; i++) + if (PageHighMem(pages[i])) + goto error_unpin; + + br = page_address(pages[0]); +#ifdef SHM_COLOUR + /* + * On platforms that have specific aliasing requirements, SHM_COLOUR + * is set and we must guarantee that the kernel and user side align + * nicely. We cannot do that if IOU_PBUF_RING_MMAP isn't set and + * the application mmap's the provided ring buffer. Fail the request + * if we, by chance, don't end up with aligned addresses. The app + * should use IOU_PBUF_RING_MMAP instead, and liburing will handle + * this transparently. + */ + if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1)) + goto error_unpin; +#endif + bl->buf_pages = pages; + bl->buf_nr_pages = nr_pages; + bl->buf_ring = br; + bl->is_mapped = 1; + bl->is_mmap = 0; + return 0; +error_unpin: + for (i = 0; i < nr_pages; i++) + unpin_user_page(pages[i]); + kvfree(pages); + return -EINVAL; +} + +static int io_alloc_pbuf_ring(struct io_uring_buf_reg *reg, + struct io_buffer_list *bl) +{ + gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP; + size_t ring_size; + void *ptr; + + ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring); + ptr = (void *) __get_free_pages(gfp, get_order(ring_size)); + if (!ptr) + return -ENOMEM; + + bl->buf_ring = ptr; + bl->is_mapped = 1; + bl->is_mmap = 1; + return 0; +} + +int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) +{ struct io_uring_buf_reg reg; struct io_buffer_list *bl, *free_bl = NULL; - struct page **pages; - int nr_pages; + int ret; if (copy_from_user(®, arg, sizeof(reg))) return -EFAULT; - if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2]) + if (reg.resv[0] || reg.resv[1] || reg.resv[2]) return -EINVAL; - if (!reg.ring_addr) - return -EFAULT; - if (reg.ring_addr & ~PAGE_MASK) + if (reg.flags & ~IOU_PBUF_RING_MMAP) return -EINVAL; + if (!(reg.flags & IOU_PBUF_RING_MMAP)) { + if (!reg.ring_addr) + return -EFAULT; + if (reg.ring_addr & ~PAGE_MASK) + return -EINVAL; + } else { + if (reg.ring_addr) + return -EINVAL; + } + if (!is_power_of_2(reg.ring_entries)) return -EINVAL; @@ -497,7 +580,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) bl = io_buffer_get_list(ctx, reg.bgid); if (bl) { /* if mapped buffer ring OR classic exists, don't allow */ - if (bl->buf_nr_pages || !list_empty(&bl->buf_list)) + if (bl->is_mapped || !list_empty(&bl->buf_list)) return -EEXIST; } else { free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL); @@ -505,22 +588,21 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) return -ENOMEM; } - pages = io_pin_pages(reg.ring_addr, - flex_array_size(br, bufs, reg.ring_entries), - &nr_pages); - if (IS_ERR(pages)) { - kfree(free_bl); - return PTR_ERR(pages); + if (!(reg.flags & IOU_PBUF_RING_MMAP)) + ret = io_pin_pbuf_ring(®, bl); + else + ret = io_alloc_pbuf_ring(®, bl); + + if (!ret) { + bl->nr_entries = reg.ring_entries; + bl->mask = reg.ring_entries - 1; + + io_buffer_add_list(ctx, bl, reg.bgid); + return 0; } - br = page_address(pages[0]); - bl->buf_pages = pages; - bl->buf_nr_pages = nr_pages; - bl->nr_entries = reg.ring_entries; - bl->buf_ring = br; - bl->mask = reg.ring_entries - 1; - io_buffer_add_list(ctx, bl, reg.bgid); - return 0; + kfree(free_bl); + return ret; } int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) @@ -530,13 +612,15 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, 
void __user *arg) if (copy_from_user(®, arg, sizeof(reg))) return -EFAULT; - if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2]) + if (reg.resv[0] || reg.resv[1] || reg.resv[2]) + return -EINVAL; + if (reg.flags) return -EINVAL; bl = io_buffer_get_list(ctx, reg.bgid); if (!bl) return -ENOENT; - if (!bl->buf_nr_pages) + if (!bl->is_mapped) return -EINVAL; __io_remove_buffers(ctx, bl, -1U); @@ -546,3 +630,14 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) } return 0; } + +void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid) +{ + struct io_buffer_list *bl; + + bl = io_buffer_get_list(ctx, bgid); + if (!bl || !bl->is_mmap) + return NULL; + + return bl->buf_ring; +} diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index c23e15d7d3ca..d14345ef61fc 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -23,6 +23,11 @@ struct io_buffer_list { __u16 nr_entries; __u16 head; __u16 mask; + + /* ring mapped provided buffers */ + __u8 is_mapped; + /* ring mapped provided buffers, but mmap'ed by application */ + __u8 is_mmap; }; struct io_buffer { @@ -50,6 +55,8 @@ unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags); void io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); +void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid); + static inline void io_kbuf_recycle_ring(struct io_kiocb *req) { /* diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index be2695eb45ec..cd6dcf634ba3 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -13,6 +13,11 @@ #include "filetable.h" #include "msg_ring.h" + +/* All valid masks for MSG_RING */ +#define IORING_MSG_RING_MASK (IORING_MSG_RING_CQE_SKIP | \ + IORING_MSG_RING_FLAGS_PASS) + struct io_msg { struct file *file; struct file *src_file; @@ -21,7 +26,10 @@ struct io_msg { u32 len; u32 cmd; u32 src_fd; - u32 dst_fd; + union { + u32 dst_fd; + u32 cqe_flags; + }; u32 flags; }; @@ -91,6 +99,11 @@ static void io_msg_tw_complete(struct callback_head *head) if (current->flags & PF_EXITING) { ret = -EOWNERDEAD; } else { + u32 flags = 0; + + if (msg->flags & IORING_MSG_RING_FLAGS_PASS) + flags = msg->cqe_flags; + /* * If the target ring is using IOPOLL mode, then we need to be * holding the uring_lock for posting completions. 
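io_register_pbuf_ring() above turns the old pad field into flags and, with IOU_PBUF_RING_MMAP set, allocates the ring itself via io_alloc_pbuf_ring(), so ring_addr must stay zero and the application maps the ring afterwards through the IORING_OFF_PBUF_RING offset shown earlier. A registration sketch, error handling omitted:

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

static int register_mmap_pbuf_ring(int ring_fd, unsigned short bgid,
				   unsigned int entries)
{
	struct io_uring_buf_reg reg;

	memset(&reg, 0, sizeof(reg));
	reg.ring_entries = entries;	/* must be a power of two */
	reg.bgid = bgid;
	reg.flags = IOU_PBUF_RING_MMAP;	/* kernel allocates; ring_addr stays 0 */

	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_PBUF_RING, &reg, 1);
}
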
Other ring @@ -99,7 +112,7 @@ static void io_msg_tw_complete(struct callback_head *head) */ if (target_ctx->flags & IORING_SETUP_IOPOLL) mutex_lock(&target_ctx->uring_lock); - if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0)) + if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags)) ret = -EOVERFLOW; if (target_ctx->flags & IORING_SETUP_IOPOLL) mutex_unlock(&target_ctx->uring_lock); @@ -114,9 +127,12 @@ static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *target_ctx = req->file->private_data; struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); + u32 flags = 0; int ret; - if (msg->src_fd || msg->dst_fd || msg->flags) + if (msg->src_fd || msg->flags & ~IORING_MSG_RING_FLAGS_PASS) + return -EINVAL; + if (!(msg->flags & IORING_MSG_RING_FLAGS_PASS) && msg->dst_fd) return -EINVAL; if (target_ctx->flags & IORING_SETUP_R_DISABLED) return -EBADFD; @@ -124,15 +140,18 @@ static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags) if (io_msg_need_remote(target_ctx)) return io_msg_exec_remote(req, io_msg_tw_complete); + if (msg->flags & IORING_MSG_RING_FLAGS_PASS) + flags = msg->cqe_flags; + ret = -EOVERFLOW; if (target_ctx->flags & IORING_SETUP_IOPOLL) { if (unlikely(io_double_lock_ctx(target_ctx, issue_flags))) return -EAGAIN; - if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0)) + if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags)) ret = 0; io_double_unlock_ctx(target_ctx); } else { - if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0)) + if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags)) ret = 0; } return ret; @@ -143,14 +162,12 @@ static struct file *io_msg_grab_file(struct io_kiocb *req, unsigned int issue_fl struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); struct io_ring_ctx *ctx = req->ctx; struct file *file = NULL; - unsigned long file_ptr; int idx = msg->src_fd; io_ring_submit_lock(ctx, issue_flags); if (likely(idx < ctx->nr_user_files)) { idx = array_index_nospec(idx, ctx->nr_user_files); - file_ptr = io_fixed_file_slot(&ctx->file_table, idx)->file_ptr; - file = (struct file *) (file_ptr & FFS_MASK); + file = io_file_from_index(&ctx->file_table, idx); if (file) get_file(file); } @@ -243,7 +260,7 @@ int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) msg->src_fd = READ_ONCE(sqe->addr3); msg->dst_fd = READ_ONCE(sqe->file_index); msg->flags = READ_ONCE(sqe->msg_ring_flags); - if (msg->flags & ~IORING_MSG_RING_CQE_SKIP) + if (msg->flags & ~IORING_MSG_RING_MASK) return -EINVAL; return 0; diff --git a/io_uring/net.c b/io_uring/net.c index 3136591fd4b9..6862b1ca981d 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -92,6 +92,7 @@ int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EINVAL; shutdown->how = READ_ONCE(sqe->len); + req->flags |= REQ_F_FORCE_ASYNC; return 0; } @@ -101,8 +102,7 @@ int io_shutdown(struct io_kiocb *req, unsigned int issue_flags) struct socket *sock; int ret; - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; + WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); sock = sock_from_file(req->file); if (unlikely(!sock)) @@ -183,10 +183,14 @@ static int io_setup_async_msg(struct io_kiocb *req, memcpy(async_msg, kmsg, sizeof(*kmsg)); if (async_msg->msg.msg_name) async_msg->msg.msg_name = &async_msg->addr; + + if ((req->flags & REQ_F_BUFFER_SELECT) && !async_msg->msg.msg_iter.nr_segs) + return -EAGAIN; + /* if were using fast_iov, set it to the new one */ - if 
(!kmsg->free_iov) { - size_t fast_idx = kmsg->msg.msg_iter.iov - kmsg->fast_iov; - async_msg->msg.msg_iter.iov = &async_msg->fast_iov[fast_idx]; + if (iter_is_iovec(&kmsg->msg.msg_iter) && !kmsg->free_iov) { + size_t fast_idx = iter_iov(&kmsg->msg.msg_iter) - kmsg->fast_iov; + async_msg->msg.msg_iter.__iov = &async_msg->fast_iov[fast_idx]; } return -EAGAIN; @@ -354,7 +358,6 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags) struct sockaddr_storage __address; struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct msghdr msg; - struct iovec iov; struct socket *sock; unsigned flags; int min_ret = 0; @@ -388,7 +391,7 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(!sock)) return -ENOTSOCK; - ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter); + ret = import_ubuf(WRITE, sr->buf, sr->len, &msg.msg_iter); if (unlikely(ret)) return ret; @@ -398,6 +401,7 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags) if (flags & MSG_WAITALL) min_ret = iov_iter_count(&msg.msg_iter); + flags &= ~MSG_INTERNAL_SENDMSG_FLAGS; msg.msg_flags = flags; ret = sock_sendmsg(sock, &msg); if (ret < min_ret) { @@ -542,6 +546,7 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg) { iomsg->msg.msg_name = &iomsg->addr; + iomsg->msg.msg_iter.nr_segs = 0; #ifdef CONFIG_COMPAT if (req->ctx->compat) @@ -625,9 +630,15 @@ static inline void io_recv_prep_retry(struct io_kiocb *req) * again (for multishot). */ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, - unsigned int cflags, bool mshot_finished, + struct msghdr *msg, bool mshot_finished, unsigned issue_flags) { + unsigned int cflags; + + cflags = io_put_kbuf(req, issue_flags); + if (msg->msg_inq && msg->msg_inq != -1) + cflags |= IORING_CQE_F_SOCK_NONEMPTY; + if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { io_req_set_res(req, *ret, cflags); *ret = IOU_OK; @@ -635,10 +646,18 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, } if (!mshot_finished) { - if (io_aux_cqe(req->ctx, issue_flags & IO_URING_F_COMPLETE_DEFER, - req->cqe.user_data, *ret, cflags | IORING_CQE_F_MORE, true)) { + if (io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER, + *ret, cflags | IORING_CQE_F_MORE)) { io_recv_prep_retry(req); - return false; + /* Known not-empty or unknown state, retry */ + if (cflags & IORING_CQE_F_SOCK_NONEMPTY || + msg->msg_inq == -1) + return false; + if (issue_flags & IO_URING_F_MULTISHOT) + *ret = IOU_ISSUE_SKIP_COMPLETE; + else + *ret = -EAGAIN; + return true; } /* Otherwise stop multishot but use the current result. 
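From userspace, the retry and stop decisions in io_recv_finish() above surface as CQE flags on a multishot receive: IORING_CQE_F_MORE means the request is still armed, IORING_CQE_F_BUFFER carries the provided-buffer id, and IORING_CQE_F_SOCK_NONEMPTY is only a hint that more data is queued. A small interpretation sketch; consume_buffer() and resubmit_recv() are hypothetical application callbacks:

#include <linux/io_uring.h>

void consume_buffer(unsigned int bid, int len);	/* hypothetical */
void resubmit_recv(void);			/* hypothetical */

static void on_recv_cqe(const struct io_uring_cqe *cqe)
{
	if (cqe->res >= 0 && (cqe->flags & IORING_CQE_F_BUFFER)) {
		unsigned int bid = cqe->flags >> IORING_CQE_BUFFER_SHIFT;

		/* cqe->res bytes landed in provided buffer `bid` */
		consume_buffer(bid, cqe->res);
	}
	/*
	 * Without IORING_CQE_F_MORE the multishot has terminated (error,
	 * out of buffers, or the "stop multishot" path above) and must be
	 * re-armed by the application.
	 */
	if (!(cqe->flags & IORING_CQE_F_MORE))
		resubmit_recv();
}
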
*/ } @@ -741,7 +760,6 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct io_async_msghdr iomsg, *kmsg; struct socket *sock; - unsigned int cflags; unsigned flags; int ret, min_ret = 0; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; @@ -784,25 +802,26 @@ retry_multishot: } } - kmsg->fast_iov[0].iov_base = buf; - kmsg->fast_iov[0].iov_len = len; - iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, 1, - len); + iov_iter_ubuf(&kmsg->msg.msg_iter, READ, buf, len); } flags = sr->msg_flags; if (force_nonblock) flags |= MSG_DONTWAIT; - if (flags & MSG_WAITALL) - min_ret = iov_iter_count(&kmsg->msg.msg_iter); kmsg->msg.msg_get_inq = 1; - if (req->flags & REQ_F_APOLL_MULTISHOT) + kmsg->msg.msg_inq = -1; + if (req->flags & REQ_F_APOLL_MULTISHOT) { ret = io_recvmsg_multishot(sock, sr, kmsg, flags, &mshot_finished); - else + } else { + /* disable partial retry for recvmsg with cmsg attached */ + if (flags & MSG_WAITALL && !kmsg->msg.msg_controllen) + min_ret = iov_iter_count(&kmsg->msg.msg_iter); + ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg, kmsg->uaddr, flags); + } if (ret < min_ret) { if (ret == -EAGAIN && force_nonblock) { @@ -832,11 +851,7 @@ retry_multishot: else io_kbuf_recycle(req, issue_flags); - cflags = io_put_kbuf(req, issue_flags); - if (kmsg->msg.msg_inq) - cflags |= IORING_CQE_F_SOCK_NONEMPTY; - - if (!io_recv_finish(req, &ret, cflags, mshot_finished, issue_flags)) + if (!io_recv_finish(req, &ret, &kmsg->msg, mshot_finished, issue_flags)) goto retry_multishot; if (mshot_finished) { @@ -855,8 +870,6 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct msghdr msg; struct socket *sock; - struct iovec iov; - unsigned int cflags; unsigned flags; int ret, min_ret = 0; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; @@ -873,6 +886,14 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(!sock)) return -ENOTSOCK; + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_control = NULL; + msg.msg_get_inq = 1; + msg.msg_controllen = 0; + msg.msg_iocb = NULL; + msg.msg_ubuf = NULL; + retry_multishot: if (io_do_buffer_select(req)) { void __user *buf; @@ -883,18 +904,12 @@ retry_multishot: sr->buf = buf; } - ret = import_single_range(READ, sr->buf, len, &iov, &msg.msg_iter); + ret = import_ubuf(READ, sr->buf, len, &msg.msg_iter); if (unlikely(ret)) goto out_free; - msg.msg_name = NULL; - msg.msg_namelen = 0; - msg.msg_control = NULL; - msg.msg_get_inq = 1; + msg.msg_inq = -1; msg.msg_flags = 0; - msg.msg_controllen = 0; - msg.msg_iocb = NULL; - msg.msg_ubuf = NULL; flags = sr->msg_flags; if (force_nonblock) @@ -934,11 +949,7 @@ out_free: else io_kbuf_recycle(req, issue_flags); - cflags = io_put_kbuf(req, issue_flags); - if (msg.msg_inq) - cflags |= IORING_CQE_F_SOCK_NONEMPTY; - - if (!io_recv_finish(req, &ret, cflags, ret <= 0, issue_flags)) + if (!io_recv_finish(req, &ret, &msg, ret <= 0, issue_flags)) goto retry_multishot; return ret; @@ -1094,7 +1105,6 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) struct sockaddr_storage __address; struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg); struct msghdr msg; - struct iovec iov; struct socket *sock; unsigned msg_flags; int ret, min_ret = 0; @@ -1136,8 +1146,7 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) msg.sg_from_iter = io_sg_from_iter; } else { io_notif_set_extended(zc->notif); - ret = 
import_single_range(WRITE, zc->buf, zc->len, &iov, - &msg.msg_iter); + ret = import_ubuf(WRITE, zc->buf, zc->len, &msg.msg_iter); if (unlikely(ret)) return ret; ret = io_notif_account_mem(zc->notif, zc->len); @@ -1151,6 +1160,7 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) msg_flags |= MSG_DONTWAIT; if (msg_flags & MSG_WAITALL) min_ret = iov_iter_count(&msg.msg_iter); + msg_flags &= ~MSG_INTERNAL_SENDMSG_FLAGS; msg.msg_flags = msg_flags; msg.msg_ubuf = &io_notif_to_data(zc->notif)->uarg; @@ -1312,7 +1322,6 @@ int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) int io_accept(struct io_kiocb *req, unsigned int issue_flags) { - struct io_ring_ctx *ctx = req->ctx; struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept); bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0; @@ -1362,8 +1371,8 @@ retry: if (ret < 0) return ret; - if (io_aux_cqe(ctx, issue_flags & IO_URING_F_COMPLETE_DEFER, - req->cqe.user_data, ret, IORING_CQE_F_MORE, true)) + if (io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER, + ret, IORING_CQE_F_MORE)) goto retry; return -ECANCELED; diff --git a/io_uring/net.h b/io_uring/net.h index 5ffa11bf5d2e..191009979bcb 100644 --- a/io_uring/net.h +++ b/io_uring/net.h @@ -5,8 +5,8 @@ #include "alloc_cache.h" -#if defined(CONFIG_NET) struct io_async_msghdr { +#if defined(CONFIG_NET) union { struct iovec fast_iov[UIO_FASTIOV]; struct { @@ -22,8 +22,11 @@ struct io_async_msghdr { struct sockaddr __user *uaddr; struct msghdr msg; struct sockaddr_storage addr; +#endif }; +#if defined(CONFIG_NET) + struct io_async_connect { struct sockaddr_storage address; }; diff --git a/io_uring/notif.c b/io_uring/notif.c index c4bb793ebf0e..d3e703c37aba 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -9,7 +9,7 @@ #include "notif.h" #include "rsrc.h" -static void io_notif_complete_tw_ext(struct io_kiocb *notif, bool *locked) +static void io_notif_complete_tw_ext(struct io_kiocb *notif, struct io_tw_state *ts) { struct io_notif_data *nd = io_notif_to_data(notif); struct io_ring_ctx *ctx = notif->ctx; @@ -21,7 +21,7 @@ static void io_notif_complete_tw_ext(struct io_kiocb *notif, bool *locked) __io_unaccount_mem(ctx->user, nd->account_pages); nd->account_pages = 0; } - io_req_task_complete(notif, locked); + io_req_task_complete(notif, ts); } static void io_tx_ubuf_callback(struct sk_buff *skb, struct ubuf_info *uarg, @@ -31,7 +31,7 @@ static void io_tx_ubuf_callback(struct sk_buff *skb, struct ubuf_info *uarg, struct io_kiocb *notif = cmd_to_io_kiocb(nd); if (refcount_dec_and_test(&uarg->refcnt)) - io_req_task_work_add(notif); + __io_req_task_work_add(notif, IOU_F_TWQ_LAZY_WAKE); } static void io_tx_ubuf_callback_ext(struct sk_buff *skb, struct ubuf_info *uarg, @@ -68,9 +68,8 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) struct io_kiocb *notif; struct io_notif_data *nd; - if (unlikely(!io_alloc_req_refill(ctx))) + if (unlikely(!io_alloc_req(ctx, ¬if))) return NULL; - notif = io_alloc_req(ctx); notif->opcode = IORING_OP_NOP; notif->flags = 0; notif->file = NULL; @@ -80,7 +79,7 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) notif->io_task_work.func = io_req_task_complete; nd = io_notif_to_data(notif); - nd->uarg.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN; + nd->uarg.flags = IO_NOTIF_UBUF_FLAGS; nd->uarg.callback = io_tx_ubuf_callback; refcount_set(&nd->uarg.refcnt, 1); return notif; diff --git a/io_uring/notif.h b/io_uring/notif.h index 
c88c800cd89d..86d32bd9f856 100644 --- a/io_uring/notif.h +++ b/io_uring/notif.h @@ -7,6 +7,7 @@ #include "rsrc.h" +#define IO_NOTIF_UBUF_FLAGS (SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN) #define IO_NOTIF_SPLICE_BATCH 32 struct io_notif_data { @@ -33,7 +34,7 @@ static inline void io_notif_flush(struct io_kiocb *notif) /* drop slot's master ref */ if (refcount_dec_and_test(&nd->uarg.refcnt)) - io_req_task_work_add(notif); + __io_req_task_work_add(notif, IOU_F_TWQ_LAZY_WAKE); } static inline int io_notif_account_mem(struct io_kiocb *notif, unsigned len) diff --git a/io_uring/opdef.c b/io_uring/opdef.c index d3f36c633ceb..3b9c6489b8b6 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -46,11 +46,10 @@ static __maybe_unused int io_eopnotsupp_prep(struct io_kiocb *kiocb, return -EOPNOTSUPP; } -const struct io_op_def io_op_defs[] = { +const struct io_issue_def io_issue_defs[] = { [IORING_OP_NOP] = { .audit_skip = 1, .iopoll = 1, - .name = "NOP", .prep = io_nop_prep, .issue = io_nop, }, @@ -64,13 +63,8 @@ const struct io_op_def io_op_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, - .async_size = sizeof(struct io_async_rw), - .name = "READV", .prep = io_prep_rw, .issue = io_read, - .prep_async = io_readv_prep_async, - .cleanup = io_readv_writev_cleanup, - .fail = io_rw_fail, }, [IORING_OP_WRITEV] = { .needs_file = 1, @@ -82,18 +76,12 @@ const struct io_op_def io_op_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, - .async_size = sizeof(struct io_async_rw), - .name = "WRITEV", .prep = io_prep_rw, .issue = io_write, - .prep_async = io_writev_prep_async, - .cleanup = io_readv_writev_cleanup, - .fail = io_rw_fail, }, [IORING_OP_FSYNC] = { .needs_file = 1, .audit_skip = 1, - .name = "FSYNC", .prep = io_fsync_prep, .issue = io_fsync, }, @@ -106,11 +94,8 @@ const struct io_op_def io_op_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, - .async_size = sizeof(struct io_async_rw), - .name = "READ_FIXED", .prep = io_prep_rw, .issue = io_read, - .fail = io_rw_fail, }, [IORING_OP_WRITE_FIXED] = { .needs_file = 1, @@ -122,30 +107,24 @@ const struct io_op_def io_op_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, - .async_size = sizeof(struct io_async_rw), - .name = "WRITE_FIXED", .prep = io_prep_rw, .issue = io_write, - .fail = io_rw_fail, }, [IORING_OP_POLL_ADD] = { .needs_file = 1, .unbound_nonreg_file = 1, .audit_skip = 1, - .name = "POLL_ADD", .prep = io_poll_add_prep, .issue = io_poll_add, }, [IORING_OP_POLL_REMOVE] = { .audit_skip = 1, - .name = "POLL_REMOVE", .prep = io_poll_remove_prep, .issue = io_poll_remove, }, [IORING_OP_SYNC_FILE_RANGE] = { .needs_file = 1, .audit_skip = 1, - .name = "SYNC_FILE_RANGE", .prep = io_sfr_prep, .issue = io_sync_file_range, }, @@ -155,14 +134,9 @@ const struct io_op_def io_op_defs[] = { .pollout = 1, .ioprio = 1, .manual_alloc = 1, - .name = "SENDMSG", #if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_msghdr), .prep = io_sendmsg_prep, .issue = io_sendmsg, - .prep_async = io_sendmsg_prep_async, - .cleanup = io_sendmsg_recvmsg_cleanup, - .fail = io_sendrecv_fail, #else .prep = io_eopnotsupp_prep, #endif @@ -174,29 +148,21 @@ const struct io_op_def io_op_defs[] = { .buffer_select = 1, .ioprio = 1, .manual_alloc = 1, - .name = "RECVMSG", #if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_msghdr), .prep = io_recvmsg_prep, .issue = io_recvmsg, - .prep_async = io_recvmsg_prep_async, - .cleanup = io_sendmsg_recvmsg_cleanup, - .fail = io_sendrecv_fail, #else .prep = io_eopnotsupp_prep, #endif }, [IORING_OP_TIMEOUT] = { 
.audit_skip = 1, - .async_size = sizeof(struct io_timeout_data), - .name = "TIMEOUT", .prep = io_timeout_prep, .issue = io_timeout, }, [IORING_OP_TIMEOUT_REMOVE] = { /* used by timeout updates' prep() */ .audit_skip = 1, - .name = "TIMEOUT_REMOVE", .prep = io_timeout_remove_prep, .issue = io_timeout_remove, }, @@ -206,7 +172,6 @@ const struct io_op_def io_op_defs[] = { .pollin = 1, .poll_exclusive = 1, .ioprio = 1, /* used for flags */ - .name = "ACCEPT", #if defined(CONFIG_NET) .prep = io_accept_prep, .issue = io_accept, @@ -216,14 +181,11 @@ const struct io_op_def io_op_defs[] = { }, [IORING_OP_ASYNC_CANCEL] = { .audit_skip = 1, - .name = "ASYNC_CANCEL", .prep = io_async_cancel_prep, .issue = io_async_cancel, }, [IORING_OP_LINK_TIMEOUT] = { .audit_skip = 1, - .async_size = sizeof(struct io_timeout_data), - .name = "LINK_TIMEOUT", .prep = io_link_timeout_prep, .issue = io_no_issue, }, @@ -231,46 +193,36 @@ const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, - .name = "CONNECT", #if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_connect), .prep = io_connect_prep, .issue = io_connect, - .prep_async = io_connect_prep_async, #else .prep = io_eopnotsupp_prep, #endif }, [IORING_OP_FALLOCATE] = { .needs_file = 1, - .name = "FALLOCATE", .prep = io_fallocate_prep, .issue = io_fallocate, }, [IORING_OP_OPENAT] = { - .name = "OPENAT", .prep = io_openat_prep, .issue = io_openat, - .cleanup = io_open_cleanup, }, [IORING_OP_CLOSE] = { - .name = "CLOSE", .prep = io_close_prep, .issue = io_close, }, [IORING_OP_FILES_UPDATE] = { .audit_skip = 1, .iopoll = 1, - .name = "FILES_UPDATE", .prep = io_files_update_prep, .issue = io_files_update, }, [IORING_OP_STATX] = { .audit_skip = 1, - .name = "STATX", .prep = io_statx_prep, .issue = io_statx, - .cleanup = io_statx_cleanup, }, [IORING_OP_READ] = { .needs_file = 1, @@ -282,11 +234,8 @@ const struct io_op_def io_op_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, - .async_size = sizeof(struct io_async_rw), - .name = "READ", .prep = io_prep_rw, .issue = io_read, - .fail = io_rw_fail, }, [IORING_OP_WRITE] = { .needs_file = 1, @@ -298,22 +247,17 @@ const struct io_op_def io_op_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, - .async_size = sizeof(struct io_async_rw), - .name = "WRITE", .prep = io_prep_rw, .issue = io_write, - .fail = io_rw_fail, }, [IORING_OP_FADVISE] = { .needs_file = 1, .audit_skip = 1, - .name = "FADVISE", .prep = io_fadvise_prep, .issue = io_fadvise, }, [IORING_OP_MADVISE] = { .audit_skip = 1, - .name = "MADVISE", .prep = io_madvise_prep, .issue = io_madvise, }, @@ -324,13 +268,9 @@ const struct io_op_def io_op_defs[] = { .audit_skip = 1, .ioprio = 1, .manual_alloc = 1, - .name = "SEND", #if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_msghdr), .prep = io_sendmsg_prep, .issue = io_send, - .fail = io_sendrecv_fail, - .prep_async = io_send_prep_async, #else .prep = io_eopnotsupp_prep, #endif @@ -342,25 +282,20 @@ const struct io_op_def io_op_defs[] = { .buffer_select = 1, .audit_skip = 1, .ioprio = 1, - .name = "RECV", #if defined(CONFIG_NET) .prep = io_recvmsg_prep, .issue = io_recv, - .fail = io_sendrecv_fail, #else .prep = io_eopnotsupp_prep, #endif }, [IORING_OP_OPENAT2] = { - .name = "OPENAT2", .prep = io_openat2_prep, .issue = io_openat2, - .cleanup = io_open_cleanup, }, [IORING_OP_EPOLL_CTL] = { .unbound_nonreg_file = 1, .audit_skip = 1, - .name = "EPOLL", #if defined(CONFIG_EPOLL) .prep = io_epoll_ctl_prep, .issue = io_epoll_ctl, @@ -373,21 +308,18 @@ 
const struct io_op_def io_op_defs[] = { .hash_reg_file = 1, .unbound_nonreg_file = 1, .audit_skip = 1, - .name = "SPLICE", .prep = io_splice_prep, .issue = io_splice, }, [IORING_OP_PROVIDE_BUFFERS] = { .audit_skip = 1, .iopoll = 1, - .name = "PROVIDE_BUFFERS", .prep = io_provide_buffers_prep, .issue = io_provide_buffers, }, [IORING_OP_REMOVE_BUFFERS] = { .audit_skip = 1, .iopoll = 1, - .name = "REMOVE_BUFFERS", .prep = io_remove_buffers_prep, .issue = io_remove_buffers, }, @@ -396,13 +328,11 @@ const struct io_op_def io_op_defs[] = { .hash_reg_file = 1, .unbound_nonreg_file = 1, .audit_skip = 1, - .name = "TEE", .prep = io_tee_prep, .issue = io_tee, }, [IORING_OP_SHUTDOWN] = { .needs_file = 1, - .name = "SHUTDOWN", #if defined(CONFIG_NET) .prep = io_shutdown_prep, .issue = io_shutdown, @@ -411,72 +341,51 @@ const struct io_op_def io_op_defs[] = { #endif }, [IORING_OP_RENAMEAT] = { - .name = "RENAMEAT", .prep = io_renameat_prep, .issue = io_renameat, - .cleanup = io_renameat_cleanup, }, [IORING_OP_UNLINKAT] = { - .name = "UNLINKAT", .prep = io_unlinkat_prep, .issue = io_unlinkat, - .cleanup = io_unlinkat_cleanup, }, [IORING_OP_MKDIRAT] = { - .name = "MKDIRAT", .prep = io_mkdirat_prep, .issue = io_mkdirat, - .cleanup = io_mkdirat_cleanup, }, [IORING_OP_SYMLINKAT] = { - .name = "SYMLINKAT", .prep = io_symlinkat_prep, .issue = io_symlinkat, - .cleanup = io_link_cleanup, }, [IORING_OP_LINKAT] = { - .name = "LINKAT", .prep = io_linkat_prep, .issue = io_linkat, - .cleanup = io_link_cleanup, }, [IORING_OP_MSG_RING] = { .needs_file = 1, .iopoll = 1, - .name = "MSG_RING", .prep = io_msg_ring_prep, .issue = io_msg_ring, - .cleanup = io_msg_ring_cleanup, }, [IORING_OP_FSETXATTR] = { .needs_file = 1, - .name = "FSETXATTR", .prep = io_fsetxattr_prep, .issue = io_fsetxattr, - .cleanup = io_xattr_cleanup, }, [IORING_OP_SETXATTR] = { - .name = "SETXATTR", .prep = io_setxattr_prep, .issue = io_setxattr, - .cleanup = io_xattr_cleanup, }, [IORING_OP_FGETXATTR] = { .needs_file = 1, - .name = "FGETXATTR", .prep = io_fgetxattr_prep, .issue = io_fgetxattr, - .cleanup = io_xattr_cleanup, }, [IORING_OP_GETXATTR] = { - .name = "GETXATTR", .prep = io_getxattr_prep, .issue = io_getxattr, - .cleanup = io_xattr_cleanup, }, [IORING_OP_SOCKET] = { .audit_skip = 1, - .name = "SOCKET", #if defined(CONFIG_NET) .prep = io_socket_prep, .issue = io_socket, @@ -487,16 +396,12 @@ const struct io_op_def io_op_defs[] = { [IORING_OP_URING_CMD] = { .needs_file = 1, .plug = 1, - .name = "URING_CMD", .iopoll = 1, .iopoll_queue = 1, - .async_size = uring_cmd_pdu_size(1), .prep = io_uring_cmd_prep, .issue = io_uring_cmd, - .prep_async = io_uring_cmd_prep_async, }, [IORING_OP_SEND_ZC] = { - .name = "SEND_ZC", .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, @@ -504,32 +409,243 @@ const struct io_op_def io_op_defs[] = { .ioprio = 1, .manual_alloc = 1, #if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_msghdr), .prep = io_send_zc_prep, .issue = io_send_zc, - .prep_async = io_send_prep_async, - .cleanup = io_send_zc_cleanup, - .fail = io_sendrecv_fail, #else .prep = io_eopnotsupp_prep, #endif }, [IORING_OP_SENDMSG_ZC] = { - .name = "SENDMSG_ZC", .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, .ioprio = 1, .manual_alloc = 1, #if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_msghdr), .prep = io_send_zc_prep, .issue = io_sendmsg_zc, +#else + .prep = io_eopnotsupp_prep, +#endif + }, +}; + + +const struct io_cold_def io_cold_defs[] = { + [IORING_OP_NOP] = { + .name = "NOP", + }, + [IORING_OP_READV] = { + 
.async_size = sizeof(struct io_async_rw), + .name = "READV", + .prep_async = io_readv_prep_async, + .cleanup = io_readv_writev_cleanup, + .fail = io_rw_fail, + }, + [IORING_OP_WRITEV] = { + .async_size = sizeof(struct io_async_rw), + .name = "WRITEV", + .prep_async = io_writev_prep_async, + .cleanup = io_readv_writev_cleanup, + .fail = io_rw_fail, + }, + [IORING_OP_FSYNC] = { + .name = "FSYNC", + }, + [IORING_OP_READ_FIXED] = { + .async_size = sizeof(struct io_async_rw), + .name = "READ_FIXED", + .fail = io_rw_fail, + }, + [IORING_OP_WRITE_FIXED] = { + .async_size = sizeof(struct io_async_rw), + .name = "WRITE_FIXED", + .fail = io_rw_fail, + }, + [IORING_OP_POLL_ADD] = { + .name = "POLL_ADD", + }, + [IORING_OP_POLL_REMOVE] = { + .name = "POLL_REMOVE", + }, + [IORING_OP_SYNC_FILE_RANGE] = { + .name = "SYNC_FILE_RANGE", + }, + [IORING_OP_SENDMSG] = { + .name = "SENDMSG", +#if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_msghdr), + .prep_async = io_sendmsg_prep_async, + .cleanup = io_sendmsg_recvmsg_cleanup, + .fail = io_sendrecv_fail, +#endif + }, + [IORING_OP_RECVMSG] = { + .name = "RECVMSG", +#if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_msghdr), + .prep_async = io_recvmsg_prep_async, + .cleanup = io_sendmsg_recvmsg_cleanup, + .fail = io_sendrecv_fail, +#endif + }, + [IORING_OP_TIMEOUT] = { + .async_size = sizeof(struct io_timeout_data), + .name = "TIMEOUT", + }, + [IORING_OP_TIMEOUT_REMOVE] = { + .name = "TIMEOUT_REMOVE", + }, + [IORING_OP_ACCEPT] = { + .name = "ACCEPT", + }, + [IORING_OP_ASYNC_CANCEL] = { + .name = "ASYNC_CANCEL", + }, + [IORING_OP_LINK_TIMEOUT] = { + .async_size = sizeof(struct io_timeout_data), + .name = "LINK_TIMEOUT", + }, + [IORING_OP_CONNECT] = { + .name = "CONNECT", +#if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_connect), + .prep_async = io_connect_prep_async, +#endif + }, + [IORING_OP_FALLOCATE] = { + .name = "FALLOCATE", + }, + [IORING_OP_OPENAT] = { + .name = "OPENAT", + .cleanup = io_open_cleanup, + }, + [IORING_OP_CLOSE] = { + .name = "CLOSE", + }, + [IORING_OP_FILES_UPDATE] = { + .name = "FILES_UPDATE", + }, + [IORING_OP_STATX] = { + .name = "STATX", + .cleanup = io_statx_cleanup, + }, + [IORING_OP_READ] = { + .async_size = sizeof(struct io_async_rw), + .name = "READ", + .fail = io_rw_fail, + }, + [IORING_OP_WRITE] = { + .async_size = sizeof(struct io_async_rw), + .name = "WRITE", + .fail = io_rw_fail, + }, + [IORING_OP_FADVISE] = { + .name = "FADVISE", + }, + [IORING_OP_MADVISE] = { + .name = "MADVISE", + }, + [IORING_OP_SEND] = { + .name = "SEND", +#if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_msghdr), + .fail = io_sendrecv_fail, + .prep_async = io_send_prep_async, +#endif + }, + [IORING_OP_RECV] = { + .name = "RECV", +#if defined(CONFIG_NET) + .fail = io_sendrecv_fail, +#endif + }, + [IORING_OP_OPENAT2] = { + .name = "OPENAT2", + .cleanup = io_open_cleanup, + }, + [IORING_OP_EPOLL_CTL] = { + .name = "EPOLL", + }, + [IORING_OP_SPLICE] = { + .name = "SPLICE", + }, + [IORING_OP_PROVIDE_BUFFERS] = { + .name = "PROVIDE_BUFFERS", + }, + [IORING_OP_REMOVE_BUFFERS] = { + .name = "REMOVE_BUFFERS", + }, + [IORING_OP_TEE] = { + .name = "TEE", + }, + [IORING_OP_SHUTDOWN] = { + .name = "SHUTDOWN", + }, + [IORING_OP_RENAMEAT] = { + .name = "RENAMEAT", + .cleanup = io_renameat_cleanup, + }, + [IORING_OP_UNLINKAT] = { + .name = "UNLINKAT", + .cleanup = io_unlinkat_cleanup, + }, + [IORING_OP_MKDIRAT] = { + .name = "MKDIRAT", + .cleanup = io_mkdirat_cleanup, + }, + [IORING_OP_SYMLINKAT] = { + .name = 
"SYMLINKAT", + .cleanup = io_link_cleanup, + }, + [IORING_OP_LINKAT] = { + .name = "LINKAT", + .cleanup = io_link_cleanup, + }, + [IORING_OP_MSG_RING] = { + .name = "MSG_RING", + .cleanup = io_msg_ring_cleanup, + }, + [IORING_OP_FSETXATTR] = { + .name = "FSETXATTR", + .cleanup = io_xattr_cleanup, + }, + [IORING_OP_SETXATTR] = { + .name = "SETXATTR", + .cleanup = io_xattr_cleanup, + }, + [IORING_OP_FGETXATTR] = { + .name = "FGETXATTR", + .cleanup = io_xattr_cleanup, + }, + [IORING_OP_GETXATTR] = { + .name = "GETXATTR", + .cleanup = io_xattr_cleanup, + }, + [IORING_OP_SOCKET] = { + .name = "SOCKET", + }, + [IORING_OP_URING_CMD] = { + .name = "URING_CMD", + .async_size = 2 * sizeof(struct io_uring_sqe), + .prep_async = io_uring_cmd_prep_async, + }, + [IORING_OP_SEND_ZC] = { + .name = "SEND_ZC", +#if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_msghdr), + .prep_async = io_send_prep_async, + .cleanup = io_send_zc_cleanup, + .fail = io_sendrecv_fail, +#endif + }, + [IORING_OP_SENDMSG_ZC] = { + .name = "SENDMSG_ZC", +#if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_msghdr), .prep_async = io_sendmsg_prep_async, .cleanup = io_send_zc_cleanup, .fail = io_sendrecv_fail, -#else - .prep = io_eopnotsupp_prep, #endif }, }; @@ -537,7 +653,7 @@ const struct io_op_def io_op_defs[] = { const char *io_uring_get_opcode(u8 opcode) { if (opcode < IORING_OP_LAST) - return io_op_defs[opcode].name; + return io_cold_defs[opcode].name; return "INVALID"; } @@ -545,12 +661,13 @@ void __init io_uring_optable_init(void) { int i; - BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST); + BUILD_BUG_ON(ARRAY_SIZE(io_cold_defs) != IORING_OP_LAST); + BUILD_BUG_ON(ARRAY_SIZE(io_issue_defs) != IORING_OP_LAST); - for (i = 0; i < ARRAY_SIZE(io_op_defs); i++) { - BUG_ON(!io_op_defs[i].prep); - if (io_op_defs[i].prep != io_eopnotsupp_prep) - BUG_ON(!io_op_defs[i].issue); - WARN_ON_ONCE(!io_op_defs[i].name); + for (i = 0; i < ARRAY_SIZE(io_issue_defs); i++) { + BUG_ON(!io_issue_defs[i].prep); + if (io_issue_defs[i].prep != io_eopnotsupp_prep) + BUG_ON(!io_issue_defs[i].issue); + WARN_ON_ONCE(!io_cold_defs[i].name); } } diff --git a/io_uring/opdef.h b/io_uring/opdef.h index df7e13d9bfba..c22c8696e749 100644 --- a/io_uring/opdef.h +++ b/io_uring/opdef.h @@ -2,7 +2,7 @@ #ifndef IOU_OP_DEF_H #define IOU_OP_DEF_H -struct io_op_def { +struct io_issue_def { /* needs req->file assigned */ unsigned needs_file : 1; /* should block plug */ @@ -29,19 +29,24 @@ struct io_op_def { unsigned iopoll_queue : 1; /* opcode specific path will handle ->async_data allocation if needed */ unsigned manual_alloc : 1; + + int (*issue)(struct io_kiocb *, unsigned int); + int (*prep)(struct io_kiocb *, const struct io_uring_sqe *); +}; + +struct io_cold_def { /* size of async data needed, if any */ unsigned short async_size; const char *name; - int (*prep)(struct io_kiocb *, const struct io_uring_sqe *); - int (*issue)(struct io_kiocb *, unsigned int); int (*prep_async)(struct io_kiocb *); void (*cleanup)(struct io_kiocb *); void (*fail)(struct io_kiocb *); }; -extern const struct io_op_def io_op_defs[]; +extern const struct io_issue_def io_issue_defs[]; +extern const struct io_cold_def io_cold_defs[]; void io_uring_optable_init(void); #endif diff --git a/io_uring/openclose.c b/io_uring/openclose.c index 67178e4bb282..e3fae26e025d 100644 --- a/io_uring/openclose.c +++ b/io_uring/openclose.c @@ -31,6 +31,17 @@ struct io_close { u32 file_slot; }; +static bool io_openat_force_async(struct io_open *open) +{ + /* + * Don't bother trying 
for O_TRUNC, O_CREAT, or O_TMPFILE open, + * it'll always -EAGAIN. Note that we test for __O_TMPFILE because + * O_TMPFILE includes O_DIRECTORY, which isn't a flag we need to force + * async for. + */ + return open->how.flags & (O_TRUNC | O_CREAT | __O_TMPFILE); +} + static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_open *open = io_kiocb_to_cmd(req, struct io_open); @@ -61,6 +72,8 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe open->nofile = rlimit(RLIMIT_NOFILE); req->flags |= REQ_F_NEED_CLEANUP; + if (io_openat_force_async(open)) + req->flags |= REQ_F_FORCE_ASYNC; return 0; } @@ -108,12 +121,7 @@ int io_openat2(struct io_kiocb *req, unsigned int issue_flags) nonblock_set = op.open_flag & O_NONBLOCK; resolve_nonblock = open->how.resolve & RESOLVE_CACHED; if (issue_flags & IO_URING_F_NONBLOCK) { - /* - * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open, - * it'll always -EAGAIN - */ - if (open->how.flags & (O_TRUNC | O_CREAT | O_TMPFILE)) - return -EAGAIN; + WARN_ON_ONCE(io_openat_force_async(open)); op.lookup_flags |= LOOKUP_CACHED; op.open_flag |= O_NONBLOCK; } @@ -144,7 +152,6 @@ int io_openat2(struct io_kiocb *req, unsigned int issue_flags) if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set) file->f_flags &= ~O_NONBLOCK; - fsnotify_open(file); if (!fixed) fd_install(ret, file); diff --git a/io_uring/poll.c b/io_uring/poll.c index 98722021742f..4c360ba8793a 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -51,6 +51,9 @@ struct io_poll_table { #define IO_WQE_F_DOUBLE 1 +static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, + void *key); + static inline struct io_kiocb *wqe_to_req(struct wait_queue_entry *wqe) { unsigned long priv = (unsigned long)wqe->private; @@ -145,7 +148,7 @@ static void io_poll_req_insert_locked(struct io_kiocb *req) hlist_add_head(&req->hash_node, &table->hbs[index].list); } -static void io_poll_tw_hash_eject(struct io_kiocb *req, bool *locked) +static void io_poll_tw_hash_eject(struct io_kiocb *req, struct io_tw_state *ts) { struct io_ring_ctx *ctx = req->ctx; @@ -156,7 +159,7 @@ static void io_poll_tw_hash_eject(struct io_kiocb *req, bool *locked) * already grabbed the mutex for us, but there is a chance it * failed. */ - io_tw_lock(ctx, locked); + io_tw_lock(ctx, ts); hash_del(&req->hash_node); req->flags &= ~REQ_F_HASH_LOCKED; } else { @@ -164,15 +167,14 @@ static void io_poll_tw_hash_eject(struct io_kiocb *req, bool *locked) } } -static void io_init_poll_iocb(struct io_poll *poll, __poll_t events, - wait_queue_func_t wake_func) +static void io_init_poll_iocb(struct io_poll *poll, __poll_t events) { poll->head = NULL; #define IO_POLL_UNMASK (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP) /* mask in events that we always want/need */ poll->events = events | IO_POLL_UNMASK; INIT_LIST_HEAD(&poll->wait.entry); - init_waitqueue_func_entry(&poll->wait, wake_func); + init_waitqueue_func_entry(&poll->wait, io_poll_wake); } static inline void io_poll_remove_entry(struct io_poll *poll) @@ -236,7 +238,7 @@ enum { * req->cqe.res. IOU_POLL_REMOVE_POLL_USE_RES indicates to remove multishot * poll and that the result is stored in req->cqe. 
*/ -static int io_poll_check_events(struct io_kiocb *req, bool *locked) +static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts) { int v; @@ -298,13 +300,13 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked) __poll_t mask = mangle_poll(req->cqe.res & req->apoll_events); - if (!io_aux_cqe(req->ctx, *locked, req->cqe.user_data, - mask, IORING_CQE_F_MORE, false)) { + if (!io_fill_cqe_req_aux(req, ts->locked, mask, + IORING_CQE_F_MORE)) { io_req_set_res(req, mask, 0); return IOU_POLL_REMOVE_POLL_USE_RES; } } else { - int ret = io_poll_issue(req, locked); + int ret = io_poll_issue(req, ts); if (ret == IOU_STOP_MULTISHOT) return IOU_POLL_REMOVE_POLL_USE_RES; if (ret < 0) @@ -324,15 +326,15 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked) return IOU_POLL_NO_ACTION; } -static void io_poll_task_func(struct io_kiocb *req, bool *locked) +void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts) { int ret; - ret = io_poll_check_events(req, locked); + ret = io_poll_check_events(req, ts); if (ret == IOU_POLL_NO_ACTION) return; io_poll_remove_entries(req); - io_poll_tw_hash_eject(req, locked); + io_poll_tw_hash_eject(req, ts); if (req->opcode == IORING_OP_POLL_ADD) { if (ret == IOU_POLL_DONE) { @@ -341,7 +343,7 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked) poll = io_kiocb_to_cmd(req, struct io_poll); req->cqe.res = mangle_poll(req->cqe.res & poll->events); } else if (ret == IOU_POLL_REISSUE) { - io_req_task_submit(req, locked); + io_req_task_submit(req, ts); return; } else if (ret != IOU_POLL_REMOVE_POLL_USE_RES) { req->cqe.res = ret; @@ -349,14 +351,14 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked) } io_req_set_res(req, req->cqe.res, 0); - io_req_task_complete(req, locked); + io_req_task_complete(req, ts); } else { - io_tw_lock(req->ctx, locked); + io_tw_lock(req->ctx, ts); if (ret == IOU_POLL_REMOVE_POLL_USE_RES) - io_req_task_complete(req, locked); + io_req_task_complete(req, ts); else if (ret == IOU_POLL_DONE || ret == IOU_POLL_REISSUE) - io_req_task_submit(req, locked); + io_req_task_submit(req, ts); else io_req_defer_failed(req, ret); } @@ -508,7 +510,7 @@ static void __io_queue_proc(struct io_poll *poll, struct io_poll_table *pt, /* mark as double wq entry */ wqe_private |= IO_WQE_F_DOUBLE; - io_init_poll_iocb(poll, first->events, first->wait.func); + io_init_poll_iocb(poll, first->events); if (!io_poll_double_prepare(req)) { /* the request is completing, just back off */ kfree(poll); @@ -569,7 +571,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req, INIT_HLIST_NODE(&req->hash_node); req->work.cancel_seq = atomic_read(&ctx->cancel_seq); - io_init_poll_iocb(poll, mask, io_poll_wake); + io_init_poll_iocb(poll, mask); poll->file = req->file; req->apoll_events = poll->events; @@ -690,7 +692,7 @@ alloc_apoll: int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) { - const struct io_op_def *def = &io_op_defs[req->opcode]; + const struct io_issue_def *def = &io_issue_defs[req->opcode]; struct async_poll *apoll; struct io_poll_table ipt; __poll_t mask = POLLPRI | POLLERR | EPOLLET; @@ -822,14 +824,10 @@ static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx, spin_lock(&hb->lock); hlist_for_each_entry(req, &hb->list, hash_node) { - if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) && - req->file != cd->file) - continue; - if (cd->seq == req->work.cancel_seq) - continue; - req->work.cancel_seq = cd->seq; - *out_bucket = hb; - return req; + if 
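The poll conversions here replace the bare bool *locked task-work argument with a struct io_tw_state, so lock state travels as a named structure that can grow later without touching every callback signature. A minimal sketch of the pattern, with hypothetical names rather than the kernel's io_tw_state API:

    #include <stdbool.h>

    struct tw_state {
    	bool locked;	/* does this callback chain hold the ring lock? */
    };

    static void tw_lock(struct tw_state *ts)
    {
    	if (!ts->locked) {
    		/* mutex_lock(&ring_lock); */
    		ts->locked = true;	/* later callbacks and the final unlock see this */
    	}
    }

    static void complete_cb(struct tw_state *ts)
    {
    	tw_lock(ts);
    	/* ... completion work that requires the lock ... */
    }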
(io_cancel_req_match(req, cd)) { + *out_bucket = hb; + return req; + } } spin_unlock(&hb->lock); } @@ -853,7 +851,8 @@ static int __io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, struct io_hash_bucket *bucket; struct io_kiocb *req; - if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_ANY)) + if (cd->flags & (IORING_ASYNC_CANCEL_FD | IORING_ASYNC_CANCEL_OP | + IORING_ASYNC_CANCEL_ANY)) req = io_poll_file_find(ctx, cd, table, &bucket); else req = io_poll_find(ctx, false, cd, table, &bucket); @@ -970,12 +969,12 @@ int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) { struct io_poll_update *poll_update = io_kiocb_to_cmd(req, struct io_poll_update); - struct io_cancel_data cd = { .data = poll_update->old_user_data, }; struct io_ring_ctx *ctx = req->ctx; + struct io_cancel_data cd = { .ctx = ctx, .data = poll_update->old_user_data, }; struct io_hash_bucket *bucket; struct io_kiocb *preq; int ret2, ret = 0; - bool locked = true; + struct io_tw_state ts = { .locked = true }; io_ring_submit_lock(ctx, issue_flags); preq = io_poll_find(ctx, true, &cd, &ctx->cancel_table, &bucket); @@ -1024,7 +1023,7 @@ found: req_set_fail(preq); io_req_set_res(preq, -ECANCELED, 0); - io_req_task_complete(preq, &locked); + io_req_task_complete(preq, &ts); out: io_ring_submit_unlock(ctx, issue_flags); if (ret < 0) { diff --git a/io_uring/poll.h b/io_uring/poll.h index b2393b403a2c..ff4d5d753387 100644 --- a/io_uring/poll.h +++ b/io_uring/poll.h @@ -38,3 +38,5 @@ bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, bool cancel_all); void io_apoll_cache_free(struct io_cache_entry *entry); + +void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts); diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index aa150102ea2f..2b5677b03f94 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -23,24 +23,21 @@ struct io_rsrc_update { u32 offset; }; +static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); +static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, struct io_mapped_ubuf **pimu, struct page **last_hpage); -#define IO_RSRC_REF_BATCH 100 - /* only define max */ #define IORING_MAX_FIXED_FILES (1U << 20) #define IORING_MAX_REG_BUFFERS (1U << 14) -void io_rsrc_refs_drop(struct io_ring_ctx *ctx) - __must_hold(&ctx->uring_lock) -{ - if (ctx->rsrc_cached_refs) { - io_rsrc_put_node(ctx->rsrc_node, ctx->rsrc_cached_refs); - ctx->rsrc_cached_refs = 0; - } -} +static const struct io_mapped_ubuf dummy_ubuf = { + /* set invalid range, so io_import_fixed() fails meeting it */ + .ubuf = -1UL, + .ubuf_end = 0, +}; int __io_account_mem(struct user_struct *user, unsigned long nr_pages) { @@ -141,7 +138,7 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo struct io_mapped_ubuf *imu = *slot; unsigned int i; - if (imu != ctx->dummy_ubuf) { + if (imu != &dummy_ubuf) { for (i = 0; i < imu->nr_bvecs; i++) unpin_user_page(imu->bvec[i].bv_page); if (imu->acct_pages) @@ -151,216 +148,129 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo *slot = NULL; } -void io_rsrc_refs_refill(struct io_ring_ctx *ctx) - __must_hold(&ctx->uring_lock) +static void io_rsrc_put_work(struct io_rsrc_node *node) { - ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH; - percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH); -} + struct 
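__io_poll_cancel() now also routes IORING_ASYNC_CANCEL_OP through the file-scan path, with per-request matching centralised in io_cancel_req_match(). From userspace, opcode-based cancelation might be driven as below; this is a hedged sketch that assumes, per the matching cancel-by-opcode series, that the opcode to match is carried in sqe->len:

    #include <liburing.h>

    /* cancel every inflight POLL_ADD on the ring, illustrative only */
    static int cancel_all_polls(struct io_uring *ring)
    {
    	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

    	if (!sqe)
    		return -EBUSY;
    	io_uring_prep_cancel(sqe, NULL,
    			     IORING_ASYNC_CANCEL_ALL | IORING_ASYNC_CANCEL_OP);
    	sqe->len = IORING_OP_POLL_ADD;	/* opcode selector */
    	return io_uring_submit(ring);
    }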
io_rsrc_put *prsrc = &node->item; -static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) -{ - struct io_rsrc_data *rsrc_data = ref_node->rsrc_data; - struct io_ring_ctx *ctx = rsrc_data->ctx; - struct io_rsrc_put *prsrc, *tmp; + if (prsrc->tag) + io_post_aux_cqe(node->ctx, prsrc->tag, 0, 0); - list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) { - list_del(&prsrc->list); - - if (prsrc->tag) { - if (ctx->flags & IORING_SETUP_IOPOLL) { - mutex_lock(&ctx->uring_lock); - io_post_aux_cqe(ctx, prsrc->tag, 0, 0); - mutex_unlock(&ctx->uring_lock); - } else { - io_post_aux_cqe(ctx, prsrc->tag, 0, 0); - } - } - - rsrc_data->do_put(ctx, prsrc); - kfree(prsrc); - } - - io_rsrc_node_destroy(ref_node); - if (atomic_dec_and_test(&rsrc_data->refs)) - complete(&rsrc_data->done); -} - -void io_rsrc_put_work(struct work_struct *work) -{ - struct io_ring_ctx *ctx; - struct llist_node *node; - - ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work); - node = llist_del_all(&ctx->rsrc_put_llist); - - while (node) { - struct io_rsrc_node *ref_node; - struct llist_node *next = node->next; - - ref_node = llist_entry(node, struct io_rsrc_node, llist); - __io_rsrc_put_work(ref_node); - node = next; + switch (node->type) { + case IORING_RSRC_FILE: + io_rsrc_file_put(node->ctx, prsrc); + break; + case IORING_RSRC_BUFFER: + io_rsrc_buf_put(node->ctx, prsrc); + break; + default: + WARN_ON_ONCE(1); + break; } } -void io_rsrc_put_tw(struct callback_head *cb) +void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node) { - struct io_ring_ctx *ctx = container_of(cb, struct io_ring_ctx, - rsrc_put_tw); - - io_rsrc_put_work(&ctx->rsrc_put_work.work); + if (!io_alloc_cache_put(&ctx->rsrc_node_cache, &node->cache)) + kfree(node); } -void io_wait_rsrc_data(struct io_rsrc_data *data) +void io_rsrc_node_ref_zero(struct io_rsrc_node *node) + __must_hold(&node->ctx->uring_lock) { - if (data && !atomic_dec_and_test(&data->refs)) - wait_for_completion(&data->done); -} - -void io_rsrc_node_destroy(struct io_rsrc_node *ref_node) -{ - percpu_ref_exit(&ref_node->refs); - kfree(ref_node); -} - -static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref) -{ - struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs); - struct io_ring_ctx *ctx = node->rsrc_data->ctx; - unsigned long flags; - bool first_add = false; - unsigned long delay = HZ; - - spin_lock_irqsave(&ctx->rsrc_ref_lock, flags); - node->done = true; - - /* if we are mid-quiesce then do not delay */ - if (node->rsrc_data->quiesce) - delay = 0; + struct io_ring_ctx *ctx = node->ctx; while (!list_empty(&ctx->rsrc_ref_list)) { node = list_first_entry(&ctx->rsrc_ref_list, struct io_rsrc_node, node); /* recycle ref nodes in order */ - if (!node->done) + if (node->refs) break; list_del(&node->node); - first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist); - } - spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags); - if (!first_add) - return; - - if (ctx->submitter_task) { - if (!task_work_add(ctx->submitter_task, &ctx->rsrc_put_tw, - ctx->notify_method)) - return; + if (likely(!node->empty)) + io_rsrc_put_work(node); + io_rsrc_node_destroy(ctx, node); } - mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay); + if (list_empty(&ctx->rsrc_ref_list) && unlikely(ctx->rsrc_quiesce)) + wake_up_all(&ctx->rsrc_quiesce_wq); } -static struct io_rsrc_node *io_rsrc_node_alloc(void) +struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) { struct io_rsrc_node *ref_node; + struct io_cache_entry *entry; - ref_node = 
kzalloc(sizeof(*ref_node), GFP_KERNEL); - if (!ref_node) - return NULL; - - if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero, - 0, GFP_KERNEL)) { - kfree(ref_node); - return NULL; + entry = io_alloc_cache_get(&ctx->rsrc_node_cache); + if (entry) { + ref_node = container_of(entry, struct io_rsrc_node, cache); + } else { + ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL); + if (!ref_node) + return NULL; } - INIT_LIST_HEAD(&ref_node->node); - INIT_LIST_HEAD(&ref_node->rsrc_list); - ref_node->done = false; + + ref_node->ctx = ctx; + ref_node->empty = 0; + ref_node->refs = 1; return ref_node; } -void io_rsrc_node_switch(struct io_ring_ctx *ctx, - struct io_rsrc_data *data_to_kill) - __must_hold(&ctx->uring_lock) -{ - WARN_ON_ONCE(!ctx->rsrc_backup_node); - WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node); - - io_rsrc_refs_drop(ctx); - - if (data_to_kill) { - struct io_rsrc_node *rsrc_node = ctx->rsrc_node; - - rsrc_node->rsrc_data = data_to_kill; - spin_lock_irq(&ctx->rsrc_ref_lock); - list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list); - spin_unlock_irq(&ctx->rsrc_ref_lock); - - atomic_inc(&data_to_kill->refs); - percpu_ref_kill(&rsrc_node->refs); - ctx->rsrc_node = NULL; - } - - if (!ctx->rsrc_node) { - ctx->rsrc_node = ctx->rsrc_backup_node; - ctx->rsrc_backup_node = NULL; - } -} - -int io_rsrc_node_switch_start(struct io_ring_ctx *ctx) -{ - if (ctx->rsrc_backup_node) - return 0; - ctx->rsrc_backup_node = io_rsrc_node_alloc(); - return ctx->rsrc_backup_node ? 0 : -ENOMEM; -} - __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ctx) { + struct io_rsrc_node *backup; + DEFINE_WAIT(we); int ret; - /* As we may drop ->uring_lock, another task may have started quiesce */ if (data->quiesce) return -ENXIO; - ret = io_rsrc_node_switch_start(ctx); - if (ret) - return ret; - io_rsrc_node_switch(ctx, data); - /* kill initial ref, already quiesced if zero */ - if (atomic_dec_and_test(&data->refs)) + backup = io_rsrc_node_alloc(ctx); + if (!backup) + return -ENOMEM; + ctx->rsrc_node->empty = true; + ctx->rsrc_node->type = -1; + list_add_tail(&ctx->rsrc_node->node, &ctx->rsrc_ref_list); + io_put_rsrc_node(ctx, ctx->rsrc_node); + ctx->rsrc_node = backup; + + if (list_empty(&ctx->rsrc_ref_list)) return 0; + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { + atomic_set(&ctx->cq_wait_nr, 1); + smp_mb(); + } + + ctx->rsrc_quiesce++; data->quiesce = true; - mutex_unlock(&ctx->uring_lock); do { + prepare_to_wait(&ctx->rsrc_quiesce_wq, &we, TASK_INTERRUPTIBLE); + mutex_unlock(&ctx->uring_lock); + ret = io_run_task_work_sig(ctx); if (ret < 0) { - atomic_inc(&data->refs); - /* wait for all works potentially completing data->done */ - flush_delayed_work(&ctx->rsrc_put_work); - reinit_completion(&data->done); mutex_lock(&ctx->uring_lock); + if (list_empty(&ctx->rsrc_ref_list)) + ret = 0; break; } - flush_delayed_work(&ctx->rsrc_put_work); - ret = wait_for_completion_interruptible(&data->done); - if (!ret) { - mutex_lock(&ctx->uring_lock); - if (atomic_read(&data->refs) <= 0) - break; - /* - * it has been revived by another thread while - * we were unlocked - */ - mutex_unlock(&ctx->uring_lock); - } - } while (1); - data->quiesce = false; + schedule(); + __set_current_state(TASK_RUNNING); + mutex_lock(&ctx->uring_lock); + ret = 0; + } while (!list_empty(&ctx->rsrc_ref_list)); + finish_wait(&ctx->rsrc_quiesce_wq, &we); + data->quiesce = false; + ctx->rsrc_quiesce--; + + if (ctx->flags & 
IORING_SETUP_DEFER_TASKRUN) { + atomic_set(&ctx->cq_wait_nr, 0); + smp_mb(); + } return ret; } @@ -405,12 +315,12 @@ static __cold void **io_alloc_page_table(size_t size) return table; } -__cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, - rsrc_put_fn *do_put, u64 __user *utags, +__cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, int type, + u64 __user *utags, unsigned nr, struct io_rsrc_data **pdata) { struct io_rsrc_data *data; - int ret = -ENOMEM; + int ret = 0; unsigned i; data = kzalloc(sizeof(*data), GFP_KERNEL); @@ -424,7 +334,7 @@ __cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, data->nr = nr; data->ctx = ctx; - data->do_put = do_put; + data->rsrc_type = type; if (utags) { ret = -EFAULT; for (i = 0; i < nr; i++) { @@ -435,9 +345,6 @@ __cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, goto fail; } } - - atomic_set(&data->refs, 1); - init_completion(&data->done); *pdata = data; return 0; fail: @@ -453,10 +360,8 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, __s32 __user *fds = u64_to_user_ptr(up->data); struct io_rsrc_data *data = ctx->file_data; struct io_fixed_file *file_slot; - struct file *file; int fd, i, err = 0; unsigned int done; - bool needs_switch = false; if (!ctx->file_data) return -ENXIO; @@ -482,16 +387,16 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, file_slot = io_fixed_file_slot(&ctx->file_table, i); if (file_slot->file_ptr) { - file = (struct file *)(file_slot->file_ptr & FFS_MASK); - err = io_queue_rsrc_removal(data, i, ctx->rsrc_node, file); + err = io_queue_rsrc_removal(data, i, + io_slot_file(file_slot)); if (err) break; file_slot->file_ptr = 0; io_file_bitmap_clear(&ctx->file_table, i); - needs_switch = true; } if (fd != -1) { - file = fget(fd); + struct file *file = fget(fd); + if (!file) { err = -EBADF; break; @@ -519,9 +424,6 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, io_file_bitmap_set(&ctx->file_table, i); } } - - if (needs_switch) - io_rsrc_node_switch(ctx, data); return done ? done : err; } @@ -532,7 +434,6 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, u64 __user *tags = u64_to_user_ptr(up->tags); struct iovec iov, __user *iovs = u64_to_user_ptr(up->data); struct page *last_hpage = NULL; - bool needs_switch = false; __u32 done; int i, err; @@ -543,7 +444,6 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, for (done = 0; done < nr_args; done++) { struct io_mapped_ubuf *imu; - int offset = up->offset + done; u64 tag = 0; err = io_copy_iov(ctx, &iov, iovs, done); @@ -564,24 +464,20 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, if (err) break; - i = array_index_nospec(offset, ctx->nr_user_bufs); - if (ctx->user_bufs[i] != ctx->dummy_ubuf) { + i = array_index_nospec(up->offset + done, ctx->nr_user_bufs); + if (ctx->user_bufs[i] != &dummy_ubuf) { err = io_queue_rsrc_removal(ctx->buf_data, i, - ctx->rsrc_node, ctx->user_bufs[i]); + ctx->user_bufs[i]); if (unlikely(err)) { io_buffer_unmap(ctx, &imu); break; } - ctx->user_bufs[i] = ctx->dummy_ubuf; - needs_switch = true; + ctx->user_bufs[i] = (struct io_mapped_ubuf *)&dummy_ubuf; } ctx->user_bufs[i] = imu; *io_get_tag_slot(ctx->buf_data, i) = tag; } - - if (needs_switch) - io_rsrc_node_switch(ctx, ctx->buf_data); return done ? 
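io_rsrc_ref_quiesce() above no longer juggles a completion and percpu refs; it parks on ctx->rsrc_quiesce_wq until ctx->rsrc_ref_list drains, re-checking after every wakeup. The same drain-and-wait shape in plain userspace C, with hypothetical names and pthreads standing in for the kernel wait queue:

    #include <pthread.h>

    struct quiesce {
    	pthread_mutex_t lock;
    	pthread_cond_t  drained;
    	int             live_nodes;
    };

    static void node_put(struct quiesce *q)
    {
    	pthread_mutex_lock(&q->lock);
    	if (--q->live_nodes == 0)
    		pthread_cond_broadcast(&q->drained);	/* wake the quiescer */
    	pthread_mutex_unlock(&q->lock);
    }

    static void quiesce_wait(struct quiesce *q)
    {
    	pthread_mutex_lock(&q->lock);
    	while (q->live_nodes)		/* mirrors the list_empty() re-check loop */
    		pthread_cond_wait(&q->drained, &q->lock);
    	pthread_mutex_unlock(&q->lock);
    }

As in the kernel version, the waiter owns the re-check: a wakeup only means "look again", not "done".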
done : err; } @@ -590,13 +486,11 @@ static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, unsigned nr_args) { __u32 tmp; - int err; + + lockdep_assert_held(&ctx->uring_lock); if (check_add_overflow(up->offset, nr_args, &tmp)) return -EOVERFLOW; - err = io_rsrc_node_switch_start(ctx); - if (err) - return err; switch (type) { case IORING_RSRC_FILE: @@ -753,20 +647,24 @@ int io_files_update(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } -int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, - struct io_rsrc_node *node, void *rsrc) +int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void *rsrc) { + struct io_ring_ctx *ctx = data->ctx; + struct io_rsrc_node *node = ctx->rsrc_node; u64 *tag_slot = io_get_tag_slot(data, idx); - struct io_rsrc_put *prsrc; - prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL); - if (!prsrc) + ctx->rsrc_node = io_rsrc_node_alloc(ctx); + if (unlikely(!ctx->rsrc_node)) { + ctx->rsrc_node = node; return -ENOMEM; + } - prsrc->tag = *tag_slot; + node->item.rsrc = rsrc; + node->type = data->rsrc_type; + node->item.tag = *tag_slot; *tag_slot = 0; - prsrc->rsrc = rsrc; - list_add(&prsrc->list, &node->rsrc_list); + list_add_tail(&node->node, &ctx->rsrc_ref_list); + io_put_rsrc_node(ctx, node); return 0; } @@ -881,20 +779,14 @@ int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file) return 0; } -static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) +static __cold void io_rsrc_file_scm_put(struct io_ring_ctx *ctx, struct file *file) { - struct file *file = prsrc->file; #if defined(CONFIG_UNIX) struct sock *sock = ctx->ring_sock->sk; struct sk_buff_head list, *head = &sock->sk_receive_queue; struct sk_buff *skb; int i; - if (!io_file_need_scm(file)) { - fput(file); - return; - } - __skb_queue_head_init(&list); /* @@ -944,11 +836,19 @@ static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) __skb_queue_tail(head, skb); spin_unlock_irq(&head->lock); } -#else - fput(file); #endif } +static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) +{ + struct file *file = prsrc->file; + + if (likely(!io_file_need_scm(file))) + fput(file); + else + io_rsrc_file_scm_put(ctx, file); +} + int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args, u64 __user *tags) { @@ -965,10 +865,7 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, return -EMFILE; if (nr_args > rlimit(RLIMIT_NOFILE)) return -EMFILE; - ret = io_rsrc_node_switch_start(ctx); - if (ret) - return ret; - ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args, + ret = io_rsrc_data_alloc(ctx, IORING_RSRC_FILE, tags, nr_args, &ctx->file_data); if (ret) return ret; @@ -1022,7 +919,6 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, /* default it to the whole table */ io_file_table_set_alloc_range(ctx, 0, ctx->nr_user_files); - io_rsrc_node_switch(ctx, NULL); return 0; fail: __io_sqe_files_unregister(ctx); @@ -1207,8 +1103,9 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, unsigned long off; size_t size; int ret, nr_pages, i; + struct folio *folio = NULL; - *pimu = ctx->dummy_ubuf; + *pimu = (struct io_mapped_ubuf *)&dummy_ubuf; if (!iov->iov_base) return 0; @@ -1221,6 +1118,32 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, goto done; } + /* If it's a huge page, try to coalesce them into a single bvec entry */ + if (nr_pages > 1) { + folio = 
page_folio(pages[0]); + for (i = 1; i < nr_pages; i++) { + /* + * Pages must be consecutive and on the same folio for + * this to work + */ + if (page_folio(pages[i]) != folio || + pages[i] != pages[i - 1] + 1) { + folio = NULL; + break; + } + } + if (folio) { + /* + * The pages are bound to the folio, so this doesn't + * actually unpin them; it drops all but one reference, + * which is usually put down by io_buffer_unmap(). + * Note, this needs a better helper. + */ + unpin_user_pages(&pages[1], nr_pages - 1); + nr_pages = 1; + } + } + imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL); if (!imu) goto done; @@ -1233,22 +1156,25 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, off = (unsigned long) iov->iov_base & ~PAGE_MASK; size = iov->iov_len; - for (i = 0; i < nr_pages; i++) { - size_t vec_len; - - vec_len = min_t(size_t, size, PAGE_SIZE - off); - imu->bvec[i].bv_page = pages[i]; - imu->bvec[i].bv_len = vec_len; - imu->bvec[i].bv_offset = off; - off = 0; - size -= vec_len; - } /* store original address for later verification */ imu->ubuf = (unsigned long) iov->iov_base; imu->ubuf_end = imu->ubuf + iov->iov_len; imu->nr_bvecs = nr_pages; *pimu = imu; ret = 0; + + if (folio) { + bvec_set_page(&imu->bvec[0], pages[0], size, off); + goto done; + } + for (i = 0; i < nr_pages; i++) { + size_t vec_len; + + vec_len = min_t(size_t, size, PAGE_SIZE - off); + bvec_set_page(&imu->bvec[i], pages[i], vec_len, off); + off = 0; + size -= vec_len; + } done: if (ret) kvfree(imu); @@ -1276,10 +1202,7 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, return -EBUSY; if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS) return -EINVAL; - ret = io_rsrc_node_switch_start(ctx); - if (ret) - return ret; - ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data); + ret = io_rsrc_data_alloc(ctx, IORING_RSRC_BUFFER, tags, nr_args, &data); if (ret) return ret; ret = io_buffers_map_alloc(ctx, nr_args); @@ -1316,8 +1239,6 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, ctx->buf_data = data; if (ret) __io_sqe_buffers_unregister(ctx); - else - io_rsrc_node_switch(ctx, NULL); return ret; } @@ -1337,7 +1258,7 @@ int io_import_fixed(int ddir, struct iov_iter *iter, return -EFAULT; /* - * May not be a start of buffer, set size appropriately + * Might not be a start of buffer, set size appropriately * and advance us to the beginning. */ offset = buf_addr - imu->ubuf; @@ -1363,7 +1284,15 @@ int io_import_fixed(int ddir, struct iov_iter *iter, const struct bio_vec *bvec = imu->bvec; if (offset <= bvec->bv_len) { - iov_iter_advance(iter, offset); + /* + * Note, huge page buffers consist of one large + * bvec entry and should always go this way. The other + * branch doesn't expect non PAGE_SIZE'd chunks. 
+ */ + iter->bvec = bvec; + iter->nr_segs = bvec->bv_len; + iter->count -= offset; + iter->iov_offset = offset; } else { unsigned long seg_skip; diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index f27f4975217d..8afa9ec66a55 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -4,6 +4,10 @@ #include +#include "alloc_cache.h" + +#define IO_NODE_ALLOC_CACHE_MAX 32 + #define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3) #define IO_RSRC_TAG_TABLE_MAX (1U << IO_RSRC_TAG_TABLE_SHIFT) #define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1) @@ -14,7 +18,6 @@ enum { }; struct io_rsrc_put { - struct list_head list; u64 tag; union { void *rsrc; @@ -30,19 +33,20 @@ struct io_rsrc_data { u64 **tags; unsigned int nr; - rsrc_put_fn *do_put; - atomic_t refs; - struct completion done; + u16 rsrc_type; bool quiesce; }; struct io_rsrc_node { - struct percpu_ref refs; + union { + struct io_cache_entry cache; + struct io_ring_ctx *ctx; + }; + int refs; + bool empty; + u16 type; struct list_head node; - struct list_head rsrc_list; - struct io_rsrc_data *rsrc_data; - struct llist_node llist; - bool done; + struct io_rsrc_put item; }; struct io_mapped_ubuf { @@ -53,17 +57,10 @@ struct io_mapped_ubuf { struct bio_vec bvec[]; }; -void io_rsrc_put_tw(struct callback_head *cb); -void io_rsrc_put_work(struct work_struct *work); -void io_rsrc_refs_refill(struct io_ring_ctx *ctx); -void io_wait_rsrc_data(struct io_rsrc_data *data); -void io_rsrc_node_destroy(struct io_rsrc_node *ref_node); -void io_rsrc_refs_drop(struct io_ring_ctx *ctx); -int io_rsrc_node_switch_start(struct io_ring_ctx *ctx); -int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, - struct io_rsrc_node *node, void *rsrc); -void io_rsrc_node_switch(struct io_ring_ctx *ctx, - struct io_rsrc_data *data_to_kill); +void io_rsrc_node_ref_zero(struct io_rsrc_node *node); +void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node); +struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx); +int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void *rsrc); int io_import_fixed(int ddir, struct iov_iter *iter, struct io_mapped_ubuf *imu, @@ -107,36 +104,24 @@ int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, unsigned int size, unsigned int type); -static inline void io_rsrc_put_node(struct io_rsrc_node *node, int nr) +static inline void io_put_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) { - percpu_ref_put_many(&node->refs, nr); -} + lockdep_assert_held(&ctx->uring_lock); -static inline void io_req_put_rsrc(struct io_kiocb *req) -{ - if (req->rsrc_node) - io_rsrc_put_node(req->rsrc_node, 1); + if (node && !--node->refs) + io_rsrc_node_ref_zero(node); } static inline void io_req_put_rsrc_locked(struct io_kiocb *req, struct io_ring_ctx *ctx) - __must_hold(&ctx->uring_lock) { - struct io_rsrc_node *node = req->rsrc_node; - - if (node) { - if (node == ctx->rsrc_node) - ctx->rsrc_cached_refs++; - else - io_rsrc_put_node(node, 1); - } + io_put_rsrc_node(ctx, req->rsrc_node); } -static inline void io_charge_rsrc_node(struct io_ring_ctx *ctx) +static inline void io_charge_rsrc_node(struct io_ring_ctx *ctx, + struct io_rsrc_node *node) { - ctx->rsrc_cached_refs--; - if (unlikely(ctx->rsrc_cached_refs < 0)) - io_rsrc_refs_refill(ctx); + node->refs++; } static inline void io_req_set_rsrc_node(struct io_kiocb *req, @@ -149,7 +134,7 @@ static inline void io_req_set_rsrc_node(struct io_kiocb *req, 
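The buffer-registration hunks above coalesce a compound (huge) page into a single bvec when every pinned page sits in one folio and is physically contiguous, which is why io_import_fixed() can now take the single-segment fast path. The qualifying test, pulled out as a hedged kernel-style helper (hypothetical name, mirroring the loop in io_sqe_buffer_register()):

    /* true if pages[0..nr_pages) can be described by one bvec */
    static bool pages_are_one_folio(struct page **pages, int nr_pages)
    {
    	struct folio *folio = page_folio(pages[0]);
    	int i;

    	for (i = 1; i < nr_pages; i++) {
    		/* must stay in the same folio and be physically consecutive */
    		if (page_folio(pages[i]) != folio ||
    		    pages[i] != pages[i - 1] + 1)
    			return false;
    	}
    	return true;
    }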
lockdep_assert_held(&ctx->uring_lock); req->rsrc_node = ctx->rsrc_node; - io_charge_rsrc_node(ctx); + io_charge_rsrc_node(ctx, ctx->rsrc_node); io_ring_submit_unlock(ctx, issue_flags); } } @@ -162,6 +147,12 @@ static inline u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx) return &data->tags[table_idx][off]; } +static inline int io_rsrc_init(struct io_ring_ctx *ctx) +{ + ctx->rsrc_node = io_rsrc_node_alloc(ctx); + return ctx->rsrc_node ? 0 : -ENOMEM; +} + int io_files_update(struct io_kiocb *req, unsigned int issue_flags); int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); diff --git a/io_uring/rw.c b/io_uring/rw.c index d058a1acb5ca..42231ee7c689 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -105,6 +105,7 @@ int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) } else { rw->kiocb.ki_ioprio = get_current_ioprio(); } + rw->kiocb.dio_complete = NULL; rw->addr = READ_ONCE(sqe->addr); rw->len = READ_ONCE(sqe->len); @@ -283,16 +284,25 @@ static inline int io_fixup_rw_res(struct io_kiocb *req, long res) return res; } -static void io_req_rw_complete(struct io_kiocb *req, bool *locked) +void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts) { + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); + struct kiocb *kiocb = &rw->kiocb; + + if ((kiocb->ki_flags & IOCB_DIO_CALLER_COMP) && kiocb->dio_complete) { + long res = kiocb->dio_complete(rw->kiocb.private); + + io_req_set_res(req, io_fixup_rw_res(req, res), 0); + } + io_req_io_end(req); if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) { - unsigned issue_flags = *locked ? 0 : IO_URING_F_UNLOCKED; + unsigned issue_flags = ts->locked ? 0 : IO_URING_F_UNLOCKED; req->cqe.flags |= io_put_kbuf(req, issue_flags); } - io_req_task_complete(req, locked); + io_req_task_complete(req, ts); } static void io_complete_rw(struct kiocb *kiocb, long res) @@ -300,11 +310,13 @@ static void io_complete_rw(struct kiocb *kiocb, long res) struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb); struct io_kiocb *req = cmd_to_io_kiocb(rw); - if (__io_complete_rw_common(req, res)) - return; - io_req_set_res(req, io_fixup_rw_res(req, res), 0); + if (!kiocb->dio_complete || !(kiocb->ki_flags & IOCB_DIO_CALLER_COMP)) { + if (__io_complete_rw_common(req, res)) + return; + io_req_set_res(req, io_fixup_rw_res(req, res), 0); + } req->io_task_work.func = io_req_rw_complete; - io_req_task_work_add(req); + __io_req_task_work_add(req, IOU_F_TWQ_LAZY_WAKE); } static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) @@ -332,7 +344,7 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret, struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); unsigned final_ret = io_fixup_rw_res(req, ret); - if (req->flags & REQ_F_CUR_POS) + if (ret >= 0 && req->flags & REQ_F_CUR_POS) req->file->f_pos = rw->kiocb.ki_pos; if (ret >= 0 && (rw->kiocb.ki_complete == io_complete_rw)) { if (!__io_complete_rw_common(req, ret)) { @@ -391,7 +403,7 @@ static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req, rw->len = sqe_len; } - ret = import_single_range(ddir, buf, sqe_len, s->fast_iov, iter); + ret = import_ubuf(ddir, buf, sqe_len, iter); if (ret) return ERR_PTR(ret); return NULL; @@ -410,7 +422,7 @@ static inline int io_import_iovec(int rw, struct io_kiocb *req, unsigned int issue_flags) { *iovec = __io_import_iovec(rw, req, s, issue_flags); - if (unlikely(IS_ERR(*iovec))) + if (IS_ERR(*iovec)) return PTR_ERR(*iovec); iov_iter_save_state(&s->iter, &s->iter_state); @@ -447,23 +459,25 @@ static 
ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter) ppos = io_kiocb_ppos(kiocb); while (iov_iter_count(iter)) { - struct iovec iovec; + void __user *addr; + size_t len; ssize_t nr; - if (!iov_iter_is_bvec(iter)) { - iovec = iov_iter_iovec(iter); + if (iter_is_ubuf(iter)) { + addr = iter->ubuf + iter->iov_offset; + len = iov_iter_count(iter); + } else if (!iov_iter_is_bvec(iter)) { + addr = iter_iov_addr(iter); + len = iter_iov_len(iter); } else { - iovec.iov_base = u64_to_user_ptr(rw->addr); - iovec.iov_len = rw->len; + addr = u64_to_user_ptr(rw->addr); + len = rw->len; } - if (ddir == READ) { - nr = file->f_op->read(file, iovec.iov_base, - iovec.iov_len, ppos); - } else { - nr = file->f_op->write(file, iovec.iov_base, - iovec.iov_len, ppos); - } + if (ddir == READ) + nr = file->f_op->read(file, addr, len, ppos); + else + nr = file->f_op->write(file, addr, len, ppos); if (nr < 0) { if (!ret) @@ -479,7 +493,7 @@ static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter) if (!rw->len) break; } - if (nr != iovec.iov_len) + if (nr != len) break; } @@ -495,15 +509,15 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, io->free_iovec = iovec; io->bytes_done = 0; /* can only be fixed buffers, no need to do anything */ - if (iov_iter_is_bvec(iter)) + if (iov_iter_is_bvec(iter) || iter_is_ubuf(iter)) return; if (!iovec) { unsigned iov_off = 0; - io->s.iter.iov = io->s.fast_iov; - if (iter->iov != fast_iov) { - iov_off = iter->iov - fast_iov; - io->s.iter.iov += iov_off; + io->s.iter.__iov = io->s.fast_iov; + if (iter->__iov != fast_iov) { + iov_off = iter_iov(iter) - fast_iov; + io->s.iter.__iov += iov_off; } if (io->s.fast_iov != fast_iov) memcpy(io->s.fast_iov + iov_off, fast_iov + iov_off, @@ -516,7 +530,7 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, struct io_rw_state *s, bool force) { - if (!force && !io_op_defs[req->opcode].prep_async) + if (!force && !io_cold_defs[req->opcode].prep_async) return 0; if (!req_has_async_data(req)) { struct io_async_rw *iorw; @@ -664,8 +678,8 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) if (unlikely(!file || !(file->f_mode & mode))) return -EBADF; - if (!io_req_ffs_set(req)) - req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT; + if (!(req->flags & REQ_F_FIXED_FILE)) + req->flags |= io_file_get_flags(file); kiocb->ki_flags = iocb_flags(file); ret = kiocb_set_rw_flags(kiocb, rw->flags); @@ -981,13 +995,6 @@ copy_iov: return ret; } -static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) -{ - io_commit_cqring_flush(ctx); - if (ctx->flags & IORING_SETUP_SQPOLL) - io_cqring_wake(ctx); -} - void io_rw_fail(struct io_kiocb *req) { int res; @@ -1058,24 +1065,17 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) if (!smp_load_acquire(&req->iopoll_completed)) break; nr_events++; - if (unlikely(req->flags & REQ_F_CQE_SKIP)) - continue; - req->cqe.flags = io_put_kbuf(req, 0); - if (unlikely(!__io_fill_cqe_req(ctx, req))) { - spin_lock(&ctx->completion_lock); - io_req_cqe_overflow(req); - spin_unlock(&ctx->completion_lock); - } } - if (unlikely(!nr_events)) return 0; - io_commit_cqring(ctx); - io_cqring_ev_posted_iopoll(ctx); pos = start ? 
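loop_rw_iter() above grows an iter_is_ubuf() branch because plain single-buffer reads and writes are now imported as ITER_UBUF rather than a one-entry iovec (the import_single_range() to import_ubuf() switch earlier in the file). A hedged sketch of the kernel-side conversion, assuming only the stock import_ubuf() helper from <linux/uio.h>:

    #include <linux/uio.h>

    static int map_user_buf(void __user *ubuf, size_t len, int ddir,
    			struct iov_iter *iter)
    {
    	/*
    	 * No struct iovec backing store is needed at all; the iter
    	 * itself records base and length for the single segment.
    	 */
    	return import_ubuf(ddir, ubuf, len, iter);
    }

Dropping the iovec removes one pointer chase per IO and shrinks the per-request state that has to be saved for retries.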
start->next : ctx->iopoll_list.first; wq_list_cut(&ctx->iopoll_list, prev, start); - io_free_batch_list(ctx, pos); + + if (WARN_ON_ONCE(!wq_list_empty(&ctx->submit_state.compl_reqs))) + return 0; + ctx->submit_state.compl_reqs.first = pos; + __io_submit_flush_completions(ctx); return nr_events; } diff --git a/io_uring/rw.h b/io_uring/rw.h index 3b733f4b610a..4b89f9659366 100644 --- a/io_uring/rw.h +++ b/io_uring/rw.h @@ -22,3 +22,4 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags); int io_writev_prep_async(struct io_kiocb *req); void io_readv_writev_cleanup(struct io_kiocb *req); void io_rw_fail(struct io_kiocb *req); +void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts); diff --git a/io_uring/slist.h b/io_uring/slist.h index f27601fa4660..0eb194817242 100644 --- a/io_uring/slist.h +++ b/io_uring/slist.h @@ -3,6 +3,9 @@ #include +#define __wq_list_for_each(pos, head) \ + for (pos = (head)->first; pos; pos = (pos)->next) + #define wq_list_for_each(pos, prv, head) \ for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next) @@ -27,28 +30,6 @@ static inline void wq_list_add_after(struct io_wq_work_node *node, list->last = node; } -/** - * wq_list_merge - merge the second list to the first one. - * @list0: the first list - * @list1: the second list - * Return the first node after mergence. - */ -static inline struct io_wq_work_node *wq_list_merge(struct io_wq_work_list *list0, - struct io_wq_work_list *list1) -{ - struct io_wq_work_node *ret; - - if (!list0->first) { - ret = list1->first; - } else { - ret = list0->first; - list0->last->next = list1->first; - } - INIT_WQ_LIST(list0); - INIT_WQ_LIST(list1); - return ret; -} - static inline void wq_list_add_tail(struct io_wq_work_node *node, struct io_wq_work_list *list) { @@ -135,4 +116,4 @@ static inline struct io_wq_work *wq_next_work(struct io_wq_work *work) return container_of(work->list.next, struct io_wq_work, list); } -#endif // INTERNAL_IO_SLIST_H \ No newline at end of file +#endif // INTERNAL_IO_SLIST_H diff --git a/io_uring/splice.c b/io_uring/splice.c index 53e4232d0866..7c4469e9540e 100644 --- a/io_uring/splice.c +++ b/io_uring/splice.c @@ -34,6 +34,7 @@ static int __io_splice_prep(struct io_kiocb *req, if (unlikely(sp->flags & ~valid_flags)) return -EINVAL; sp->splice_fd_in = READ_ONCE(sqe->splice_fd_in); + req->flags |= REQ_F_FORCE_ASYNC; return 0; } @@ -52,8 +53,7 @@ int io_tee(struct io_kiocb *req, unsigned int issue_flags) struct file *in; long ret = 0; - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; + WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); if (sp->flags & SPLICE_F_FD_IN_FIXED) in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags); @@ -68,7 +68,7 @@ int io_tee(struct io_kiocb *req, unsigned int issue_flags) ret = do_tee(in, out, sp->len, flags); if (!(sp->flags & SPLICE_F_FD_IN_FIXED)) - io_put_file(in); + fput(in); done: if (ret != sp->len) req_set_fail(req); @@ -94,8 +94,7 @@ int io_splice(struct io_kiocb *req, unsigned int issue_flags) struct file *in; long ret = 0; - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; + WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); if (sp->flags & SPLICE_F_FD_IN_FIXED) in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags); @@ -113,7 +112,7 @@ int io_splice(struct io_kiocb *req, unsigned int issue_flags) ret = do_splice(in, poff_in, out, poff_out, sp->len, flags); if (!(sp->flags & SPLICE_F_FD_IN_FIXED)) - io_put_file(in); + fput(in); done: if (ret != sp->len) req_set_fail(req); diff --git a/io_uring/sqpoll.c 
b/io_uring/sqpoll.c index a7968565f9e7..bd6c2c7959a5 100644 --- a/io_uring/sqpoll.c +++ b/io_uring/sqpoll.c @@ -255,9 +255,13 @@ static int io_sq_thread(void *data) sqt_spin = true; if (sqt_spin || !time_after(jiffies, timeout)) { - cond_resched(); if (sqt_spin) timeout = jiffies + sqd->sq_thread_idle; + if (unlikely(need_resched())) { + mutex_unlock(&sqd->lock); + cond_resched(); + mutex_lock(&sqd->lock); + } continue; } @@ -311,7 +315,7 @@ static int io_sq_thread(void *data) do_exit(0); } -int io_sqpoll_wait_sq(struct io_ring_ctx *ctx) +void io_sqpoll_wait_sq(struct io_ring_ctx *ctx) { DEFINE_WAIT(wait); @@ -326,7 +330,6 @@ int io_sqpoll_wait_sq(struct io_ring_ctx *ctx) } while (!signal_pending(current)); finish_wait(&ctx->sqo_sq_wait, &wait); - return 0; } __cold int io_sq_offload_create(struct io_ring_ctx *ctx, @@ -418,3 +421,20 @@ err: io_sq_thread_finish(ctx); return ret; } + +__cold int io_sqpoll_wq_cpu_affinity(struct io_ring_ctx *ctx, + cpumask_var_t mask) +{ + struct io_sq_data *sqd = ctx->sq_data; + int ret = -EINVAL; + + if (sqd) { + io_sq_thread_park(sqd); + /* Don't set affinity for a dying thread */ + if (sqd->thread) + ret = io_wq_cpu_affinity(sqd->thread->io_uring, mask); + io_sq_thread_unpark(sqd); + } + + return ret; +} diff --git a/io_uring/sqpoll.h b/io_uring/sqpoll.h index 0c3fbcd1f583..8df37e8c9149 100644 --- a/io_uring/sqpoll.h +++ b/io_uring/sqpoll.h @@ -26,4 +26,5 @@ void io_sq_thread_stop(struct io_sq_data *sqd); void io_sq_thread_park(struct io_sq_data *sqd); void io_sq_thread_unpark(struct io_sq_data *sqd); void io_put_sq_data(struct io_sq_data *sqd); -int io_sqpoll_wait_sq(struct io_ring_ctx *ctx); +void io_sqpoll_wait_sq(struct io_ring_ctx *ctx); +int io_sqpoll_wq_cpu_affinity(struct io_ring_ctx *ctx, cpumask_var_t mask); diff --git a/io_uring/statx.c b/io_uring/statx.c index d8fc933d3f59..abb874209caa 100644 --- a/io_uring/statx.c +++ b/io_uring/statx.c @@ -48,6 +48,7 @@ int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) } req->flags |= REQ_F_NEED_CLEANUP; + req->flags |= REQ_F_FORCE_ASYNC; return 0; } @@ -56,8 +57,7 @@ int io_statx(struct io_kiocb *req, unsigned int issue_flags) struct io_statx *sx = io_kiocb_to_cmd(req, struct io_statx); int ret; - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; + WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = do_statx(sx->dfd, sx->filename, sx->flags, sx->mask, sx->buffer); io_req_set_res(req, ret, 0); diff --git a/io_uring/sync.c b/io_uring/sync.c index 64e87ea2b8fb..255f68c37e55 100644 --- a/io_uring/sync.c +++ b/io_uring/sync.c @@ -32,6 +32,8 @@ int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sync->off = READ_ONCE(sqe->off); sync->len = READ_ONCE(sqe->len); sync->flags = READ_ONCE(sqe->sync_range_flags); + req->flags |= REQ_F_FORCE_ASYNC; + return 0; } @@ -41,8 +43,7 @@ int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags) int ret; /* sync_file_range always requires a blocking context */ - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; + WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = sync_file_range(req->file, sync->off, sync->len, sync->flags); io_req_set_res(req, ret, 0); @@ -62,6 +63,7 @@ int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sync->off = READ_ONCE(sqe->off); sync->len = READ_ONCE(sqe->len); + req->flags |= REQ_F_FORCE_ASYNC; return 0; } @@ -72,8 +74,7 @@ int io_fsync(struct io_kiocb *req, unsigned int issue_flags) int ret; /* fsync always requires a blocking context */ - if (issue_flags & 
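io_sqpoll_wq_cpu_affinity() above lets the IORING_REGISTER_IOWQ_AFF path reach an SQPOLL ring's poller thread. From userspace that would plausibly look like the sketch below, using liburing's existing affinity helper; the routing through the new function is this patch's addition, so treat that part as an assumption:

    #define _GNU_SOURCE
    #include <liburing.h>
    #include <sched.h>

    /* pin io-wq workers (and, with this patch, the SQPOLL thread's
     * workers too) to CPU 0 */
    static int pin_sqpoll(struct io_uring *ring)
    {
    	cpu_set_t mask;

    	CPU_ZERO(&mask);
    	CPU_SET(0, &mask);
    	return io_uring_register_iowq_aff(ring, sizeof(mask), &mask);
    }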
IO_URING_F_NONBLOCK) - return -EAGAIN; + WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = vfs_fsync_range(req->file, sync->off, end > 0 ? end : LLONG_MAX, sync->flags & IORING_FSYNC_DATASYNC); @@ -91,6 +92,7 @@ int io_fallocate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sync->off = READ_ONCE(sqe->off); sync->len = READ_ONCE(sqe->addr); sync->mode = READ_ONCE(sqe->len); + req->flags |= REQ_F_FORCE_ASYNC; return 0; } @@ -100,8 +102,8 @@ int io_fallocate(struct io_kiocb *req, unsigned int issue_flags) int ret; /* fallocate always requires a blocking context */ - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; + WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); + ret = vfs_fallocate(req->file, sync->mode, sync->off, sync->len); if (ret >= 0) fsnotify_modify(req->file); diff --git a/io_uring/tctx.c b/io_uring/tctx.c index 4324b1cf1f6a..c043fe93a3f2 100644 --- a/io_uring/tctx.c +++ b/io_uring/tctx.c @@ -83,7 +83,7 @@ __cold int io_uring_alloc_task_context(struct task_struct *task, xa_init(&tctx->xa); init_waitqueue_head(&tctx->wait); - atomic_set(&tctx->in_idle, 0); + atomic_set(&tctx->in_cancel, 0); atomic_set(&tctx->inflight_tracked, 0); task->io_uring = tctx; init_llist_head(&tctx->task_list); @@ -208,29 +208,38 @@ void io_uring_unreg_ringfd(void) } } +int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file, + int start, int end) +{ + int offset; + for (offset = start; offset < end; offset++) { + offset = array_index_nospec(offset, IO_RINGFD_REG_MAX); + if (tctx->registered_rings[offset]) + continue; + + tctx->registered_rings[offset] = file; + return offset; + } + return -EBUSY; +} + static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd, int start, int end) { struct file *file; int offset; - for (offset = start; offset < end; offset++) { - offset = array_index_nospec(offset, IO_RINGFD_REG_MAX); - if (tctx->registered_rings[offset]) - continue; - - file = fget(fd); - if (!file) { - return -EBADF; - } else if (!io_is_uring_fops(file)) { - fput(file); - return -EOPNOTSUPP; - } - tctx->registered_rings[offset] = file; - return offset; + file = fget(fd); + if (!file) { + return -EBADF; + } else if (!io_is_uring_fops(file)) { + fput(file); + return -EOPNOTSUPP; } - - return -EBUSY; + offset = io_ring_add_registered_file(tctx, file, start, end); + if (offset < 0) + fput(file); + return offset; } /* diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 826a51bca3e4..7fd7dbb211d6 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -17,6 +17,7 @@ struct io_timeout { struct file *file; u32 off; u32 target_seq; + u32 repeats; struct list_head list; /* head of the link, used by linked timeouts only */ struct io_kiocb *head; @@ -37,8 +38,9 @@ struct io_timeout_rem { static inline bool io_is_timeout_noseq(struct io_kiocb *req) { struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); + struct io_timeout_data *data = req->async_data; - return !timeout->off; + return !timeout->off || data->flags & IORING_TIMEOUT_MULTISHOT; } static inline void io_put_req(struct io_kiocb *req) { @@ -49,6 +51,44 @@ static inline void io_put_req(struct io_kiocb *req) } } +static inline bool io_timeout_finish(struct io_timeout *timeout, + struct io_timeout_data *data) +{ + if (!(data->flags & IORING_TIMEOUT_MULTISHOT)) + return true; + + if (!timeout->off || (timeout->repeats && --timeout->repeats)) + return false; + + return true; +} + +static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer); + +static void 
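statx, sync_file_range, fsync, fallocate and splice all switch in this series to a prep-time REQ_F_FORCE_ASYNC, replacing the old issue-time -EAGAIN bounce; the issue handlers then merely assert they were never called in nonblocking context. The shape of the pattern, as a hedged sketch with hypothetical opcode names:

    /* prep: mark the request so the core never attempts a nonblocking issue */
    static int io_blocking_op_prep(struct io_kiocb *req,
    			       const struct io_uring_sqe *sqe)
    {
    	/* ... parse and validate the sqe ... */
    	req->flags |= REQ_F_FORCE_ASYNC;
    	return 0;
    }

    /* issue: by construction only reached from a blocking context */
    static int io_blocking_op(struct io_kiocb *req, unsigned int issue_flags)
    {
    	WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
    	/* ... do the blocking work, set the result ... */
    	return IOU_OK;
    }

This saves a doomed inline attempt per submission for operations that can never complete nonblocking.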
io_timeout_complete(struct io_kiocb *req, struct io_tw_state *ts) +{ + struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); + struct io_timeout_data *data = req->async_data; + struct io_ring_ctx *ctx = req->ctx; + + if (!io_timeout_finish(timeout, data)) { + bool filled; + filled = io_fill_cqe_req_aux(req, ts->locked, -ETIME, + IORING_CQE_F_MORE); + if (filled) { + /* re-arm timer */ + spin_lock_irq(&ctx->timeout_lock); + list_add(&timeout->list, ctx->timeout_list.prev); + data->timer.function = io_timeout_fn; + hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); + spin_unlock_irq(&ctx->timeout_lock); + return; + } + } + + io_req_task_complete(req, ts); +} + static bool io_kill_timeout(struct io_kiocb *req, int status) __must_hold(&req->ctx->timeout_lock) { @@ -101,9 +141,9 @@ __cold void io_flush_timeouts(struct io_ring_ctx *ctx) spin_unlock_irq(&ctx->timeout_lock); } -static void io_req_tw_fail_links(struct io_kiocb *link, bool *locked) +static void io_req_tw_fail_links(struct io_kiocb *link, struct io_tw_state *ts) { - io_tw_lock(link->ctx, locked); + io_tw_lock(link->ctx, ts); while (link) { struct io_kiocb *nxt = link->link; long res = -ECANCELED; @@ -112,7 +152,7 @@ static void io_req_tw_fail_links(struct io_kiocb *link, bool *locked) res = link->cqe.res; link->link = NULL; io_req_set_res(link, res, 0); - io_req_task_complete(link, locked); + io_req_task_complete(link, ts); link = nxt; } } @@ -212,7 +252,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) req_set_fail(req); io_req_set_res(req, -ETIME, 0); - req->io_task_work.func = io_req_task_complete; + req->io_task_work.func = io_timeout_complete; io_req_task_work_add(req); return HRTIMER_NORESTART; } @@ -228,16 +268,10 @@ static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, list_for_each_entry(timeout, &ctx->timeout_list, list) { struct io_kiocb *tmp = cmd_to_io_kiocb(timeout); - if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) && - cd->data != tmp->cqe.user_data) - continue; - if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) { - if (cd->seq == tmp->work.cancel_seq) - continue; - tmp->work.cancel_seq = cd->seq; + if (io_cancel_req_match(tmp, cd)) { + req = tmp; + break; } - req = tmp; - break; } if (!req) return ERR_PTR(-ENOENT); @@ -265,9 +299,9 @@ int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) return 0; } -static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked) +static void io_req_task_link_timeout(struct io_kiocb *req, struct io_tw_state *ts) { - unsigned issue_flags = *locked ? 0 : IO_URING_F_UNLOCKED; + unsigned issue_flags = ts->locked ? 
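io_timeout_complete() above re-arms IORING_TIMEOUT_MULTISHOT requests, emitting one CQE with IORING_CQE_F_MORE per expiry until the repeat budget (sqe->off, stored in timeout->repeats) runs out or the timeout is cancelled. A hedged userspace sketch; it assumes liburing's stock timeout prep, and, per the prep checks later in this patch, that multishot only pairs with relative timespecs:

    #include <liburing.h>

    /* fire a CQE every second, five times in total */
    static int arm_tick(struct io_uring *ring)
    {
    	static struct __kernel_timespec ts = { .tv_sec = 1 };
    	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

    	if (!sqe)
    		return -EBUSY;
    	/* count = 5 becomes timeout->repeats; 0 would repeat until cancelled */
    	io_uring_prep_timeout(sqe, &ts, 5, IORING_TIMEOUT_MULTISHOT);
    	return io_uring_submit(ring);
    }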
0 : IO_URING_F_UNLOCKED; struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_kiocb *prev = timeout->prev; int ret = -ENOENT; @@ -282,11 +316,11 @@ static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked) ret = io_try_cancel(req->task->io_uring, &cd, issue_flags); } io_req_set_res(req, ret ?: -ETIME, 0); - io_req_task_complete(req, locked); + io_req_task_complete(req, ts); io_put_req(prev); } else { io_req_set_res(req, -ETIME, 0); - io_req_task_complete(req, locked); + io_req_task_complete(req, ts); } } @@ -369,7 +403,7 @@ static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, struct timespec64 *ts, enum hrtimer_mode mode) __must_hold(&ctx->timeout_lock) { - struct io_cancel_data cd = { .data = user_data, }; + struct io_cancel_data cd = { .ctx = ctx, .data = user_data, }; struct io_kiocb *req = io_timeout_extract(ctx, &cd); struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_timeout_data *data; @@ -433,7 +467,7 @@ int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) int ret; if (!(tr->flags & IORING_TIMEOUT_UPDATE)) { - struct io_cancel_data cd = { .data = tr->addr, }; + struct io_cancel_data cd = { .ctx = ctx, .data = tr->addr, }; spin_lock(&ctx->completion_lock); ret = io_timeout_cancel(ctx, &cd); @@ -470,16 +504,27 @@ static int __io_timeout_prep(struct io_kiocb *req, return -EINVAL; flags = READ_ONCE(sqe->timeout_flags); if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK | - IORING_TIMEOUT_ETIME_SUCCESS)) + IORING_TIMEOUT_ETIME_SUCCESS | + IORING_TIMEOUT_MULTISHOT)) return -EINVAL; /* more than one clock specified is invalid, obviously */ if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1) return -EINVAL; + /* multishot requests only make sense with rel values */ + if (!(~flags & (IORING_TIMEOUT_MULTISHOT | IORING_TIMEOUT_ABS))) + return -EINVAL; INIT_LIST_HEAD(&timeout->list); timeout->off = off; if (unlikely(off && !req->ctx->off_timeout_used)) req->ctx->off_timeout_used = true; + /* + * for multishot reqs w/ fixed nr of repeats, repeats tracks the + * remaining nr + */ + timeout->repeats = 0; + if ((flags & IORING_TIMEOUT_MULTISHOT) && off > 0) + timeout->repeats = off; if (WARN_ON_ONCE(req_has_async_data(req))) return -EFAULT; @@ -543,7 +588,7 @@ int io_timeout(struct io_kiocb *req, unsigned int issue_flags) goto add; } - tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); + tail = data_race(ctx->cached_cq_tail) - atomic_read(&ctx->cq_timeouts); timeout->target_seq = tail + off; /* Update the last seq here in case io_flush_timeouts() hasn't. diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 9a1dee571872..537795fddc87 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -7,36 +7,44 @@ #include #include +#include #include "io_uring.h" #include "rsrc.h" #include "uring_cmd.h" -static void io_uring_cmd_work(struct io_kiocb *req, bool *locked) +static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); - unsigned issue_flags = *locked ? 0 : IO_URING_F_UNLOCKED; + unsigned issue_flags = ts->locked ? 
0 : IO_URING_F_UNLOCKED; ioucmd->task_work_cb(ioucmd, issue_flags); } -void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *, unsigned)) +void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *, unsigned), + unsigned flags) { struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); ioucmd->task_work_cb = task_work_cb; req->io_task_work.func = io_uring_cmd_work; - io_req_task_work_add(req); + __io_req_task_work_add(req, flags); } -EXPORT_SYMBOL_GPL(io_uring_cmd_complete_in_task); +EXPORT_SYMBOL_GPL(__io_uring_cmd_do_in_task); + +void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *, unsigned)) +{ + __io_uring_cmd_do_in_task(ioucmd, task_work_cb, IOU_F_TWQ_LAZY_WAKE); +} +EXPORT_SYMBOL_GPL(io_uring_cmd_do_in_task_lazy); static inline void io_req_set_cqe32_extra(struct io_kiocb *req, u64 extra1, u64 extra2) { - req->extra1 = extra1; - req->extra2 = extra2; - req->flags |= REQ_F_CQE32_INIT; + req->big_cqe.extra1 = extra1; + req->big_cqe.extra2 = extra2; } /* @@ -54,25 +62,24 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2, io_req_set_res(req, ret, 0); if (req->ctx->flags & IORING_SETUP_CQE32) io_req_set_cqe32_extra(req, res2, 0); - if (req->ctx->flags & IORING_SETUP_IOPOLL) + if (req->ctx->flags & IORING_SETUP_IOPOLL) { /* order with io_iopoll_req_issued() checking ->iopoll_complete */ smp_store_release(&req->iopoll_completed, 1); - else - io_req_complete_post(req, issue_flags); + } else { + struct io_tw_state ts = { + .locked = !(issue_flags & IO_URING_F_UNLOCKED), + }; + io_req_task_complete(req, &ts); + } } EXPORT_SYMBOL_GPL(io_uring_cmd_done); int io_uring_cmd_prep_async(struct io_kiocb *req) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); - size_t cmd_size; - BUILD_BUG_ON(uring_cmd_pdu_size(0) != 16); - BUILD_BUG_ON(uring_cmd_pdu_size(1) != 80); - - cmd_size = uring_cmd_pdu_size(req->ctx->flags & IORING_SETUP_SQE128); - - memcpy(req->async_data, ioucmd->cmd, cmd_size); + memcpy(req->async_data, ioucmd->sqe, uring_sqe_size(req->ctx)); + ioucmd->sqe = req->async_data; return 0; } @@ -98,7 +105,7 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->imu = ctx->user_bufs[index]; io_req_set_rsrc_node(req, ctx, 0); } - ioucmd->cmd = sqe->cmd; + ioucmd->sqe = sqe; ioucmd->cmd_op = READ_ONCE(sqe->cmd_op); return 0; } @@ -129,9 +136,6 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) WRITE_ONCE(ioucmd->cookie, NULL); } - if (req_has_async_data(req)) - ioucmd->cmd = req->async_data; - ret = file->f_op->uring_cmd(ioucmd, issue_flags); if (ret == -EAGAIN) { if (!req_has_async_data(req)) { @@ -160,3 +164,30 @@ int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, return io_import_fixed(rw, iter, req->imu, ubuf, len); } EXPORT_SYMBOL_GPL(io_uring_cmd_import_fixed); + +int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + struct socket *sock = cmd->file->private_data; + struct sock *sk = sock->sk; + struct proto *prot = READ_ONCE(sk->sk_prot); + int ret, arg = 0; + + if (!prot || !prot->ioctl) + return -EOPNOTSUPP; + + switch (cmd->sqe->cmd_op) { + case SOCKET_URING_OP_SIOCINQ: + ret = prot->ioctl(sk, SIOCINQ, &arg); + if (ret) + return ret; + return arg; + case SOCKET_URING_OP_SIOCOUTQ: + ret = prot->ioctl(sk, SIOCOUTQ, &arg); + if (ret) + return ret; + return arg; + default: + return -EOPNOTSUPP; + } +} 
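io_uring_cmd_sock() above is the first f_op->uring_cmd handler wired up for sockets (see the .uring_cmd = io_uring_cmd_sock hookup in net/socket.c later in this series): SOCKET_URING_OP_SIOCINQ and SOCKET_URING_OP_SIOCOUTQ report the queued receive/send byte counts directly in the CQE result instead of through a user pointer. A minimal userspace sketch of driving SIOCINQ this way, assuming liburing for the ring plumbing; the helper name sock_bytes_ready() is illustrative, not part of this series:

	/* Sketch: query SIOCINQ through IORING_OP_URING_CMD. Assumes the
	 * SOCKET_URING_OP_* constants from this series are visible via
	 * <linux/io_uring.h>; not a drop-in test case. */
	#include <liburing.h>
	#include <linux/io_uring.h>

	static int sock_bytes_ready(struct io_uring *ring, int sockfd)
	{
		struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
		struct io_uring_cqe *cqe;
		int ret;

		if (!sqe)
			return -EBUSY;	/* SQ ring full */

		io_uring_prep_rw(IORING_OP_URING_CMD, sqe, sockfd, NULL, 0, 0);
		sqe->cmd_op = SOCKET_URING_OP_SIOCINQ;	/* read in io_uring_cmd_prep() */

		io_uring_submit(ring);
		ret = io_uring_wait_cqe(ring, &cqe);
		if (ret < 0)
			return ret;
		ret = cqe->res;		/* >= 0: queued bytes, < 0: -errno */
		io_uring_cqe_seen(ring, cqe);
		return ret;
	}

On the kernel side the ordinary protocol ->ioctl() implementation does the work; only the final copy back to userspace is skipped, since the result travels in cqe->res.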
+EXPORT_SYMBOL_GPL(io_uring_cmd_sock); diff --git a/io_uring/uring_cmd.h b/io_uring/uring_cmd.h index 7c6697d13cb2..8117684ec3ca 100644 --- a/io_uring/uring_cmd.h +++ b/io_uring/uring_cmd.h @@ -3,11 +3,3 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags); int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_uring_cmd_prep_async(struct io_kiocb *req); - -/* - * The URING_CMD payload starts at 'cmd' in the first sqe, and continues into - * the following sqe if SQE128 is used. - */ -#define uring_cmd_pdu_size(is_sqe128) \ - ((1 + !!(is_sqe128)) * sizeof(struct io_uring_sqe) - \ - offsetof(struct io_uring_sqe, cmd)) diff --git a/io_uring/xattr.c b/io_uring/xattr.c index 99df641594d7..755fd556a8a2 100644 --- a/io_uring/xattr.c +++ b/io_uring/xattr.c @@ -75,6 +75,7 @@ static int __io_getxattr_prep(struct io_kiocb *req, } req->flags |= REQ_F_NEED_CLEANUP; + req->flags |= REQ_F_FORCE_ASYNC; return 0; } @@ -109,8 +110,7 @@ int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags) struct io_xattr *ix = io_kiocb_to_cmd(req, struct io_xattr); int ret; - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; + WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = do_getxattr(mnt_user_ns(req->file->f_path.mnt), req->file->f_path.dentry, @@ -127,8 +127,7 @@ int io_getxattr(struct io_kiocb *req, unsigned int issue_flags) struct path path; int ret; - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; + WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); retry: ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL); @@ -176,6 +175,7 @@ static int __io_setxattr_prep(struct io_kiocb *req, } req->flags |= REQ_F_NEED_CLEANUP; + req->flags |= REQ_F_FORCE_ASYNC; return 0; } @@ -224,8 +224,7 @@ int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags) { int ret; - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; + WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = __io_setxattr(req, issue_flags, &req->file->f_path); io_xattr_finish(req, ret); @@ -239,8 +238,7 @@ int io_setxattr(struct io_kiocb *req, unsigned int issue_flags) struct path path; int ret; - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; + WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); retry: ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL); diff --git a/lib/iov_iter.c b/lib/iov_iter.c index c3ca28ca68a6..5df03dedfc01 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -126,13 +126,13 @@ __out: \ iterate_buf(i, n, base, len, off, \ i->ubuf, (I)) \ } else if (likely(iter_is_iovec(i))) { \ - const struct iovec *iov = i->iov; \ + const struct iovec *iov = iter_iov(i); \ void __user *base; \ size_t len; \ iterate_iovec(i, n, base, len, off, \ iov, (I)) \ - i->nr_segs -= iov - i->iov; \ - i->iov = iov; \ + i->nr_segs -= iov - iter_iov(i); \ + i->__iov = iov; \ } else if (iov_iter_is_bvec(i)) { \ const struct bio_vec *bvec = i->bvec; \ void *base; \ @@ -361,7 +361,7 @@ size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t size) size_t skip; size -= count; - for (p = i->iov, skip = i->iov_offset; count; p++, skip = 0) { + for (p = iter_iov(i), skip = i->iov_offset; count; p++, skip = 0) { size_t len = min(count, p->iov_len - skip); size_t ret; @@ -404,7 +404,7 @@ size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t size) size_t skip; size -= count; - for (p = i->iov, skip = i->iov_offset; count; p++, skip = 0) { + for (p = iter_iov(i), skip = i->iov_offset; count; p++, skip = 0) { size_t len = 
min(count, p->iov_len - skip); size_t ret; @@ -431,7 +431,7 @@ void iov_iter_init(struct iov_iter *i, unsigned int direction, .nofault = false, .user_backed = true, .data_source = direction, - .iov = iov, + .__iov = iov, .nr_segs = nr_segs, .iov_offset = 0, .count = count @@ -881,14 +881,14 @@ static void iov_iter_iovec_advance(struct iov_iter *i, size_t size) i->count -= size; size += i->iov_offset; // from beginning of current segment - for (iov = i->iov, end = iov + i->nr_segs; iov < end; iov++) { + for (iov = iter_iov(i), end = iov + i->nr_segs; iov < end; iov++) { if (likely(size < iov->iov_len)) break; size -= iov->iov_len; } i->iov_offset = size; - i->nr_segs -= iov - i->iov; - i->iov = iov; + i->nr_segs -= iov - iter_iov(i); + i->__iov = iov; } void iov_iter_advance(struct iov_iter *i, size_t size) @@ -963,12 +963,12 @@ void iov_iter_revert(struct iov_iter *i, size_t unroll) unroll -= n; } } else { /* same logics for iovec and kvec */ - const struct iovec *iov = i->iov; + const struct iovec *iov = iter_iov(i); while (1) { size_t n = (--iov)->iov_len; i->nr_segs++; if (unroll <= n) { - i->iov = iov; + i->__iov = iov; i->iov_offset = n - unroll; return; } @@ -985,7 +985,7 @@ size_t iov_iter_single_seg_count(const struct iov_iter *i) { if (i->nr_segs > 1) { if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) - return min(i->count, i->iov->iov_len - i->iov_offset); + return min(i->count, iter_iov(i)->iov_len - i->iov_offset); if (iov_iter_is_bvec(i)) return min(i->count, i->bvec->bv_len - i->iov_offset); } @@ -1100,13 +1100,14 @@ static bool iov_iter_aligned_iovec(const struct iov_iter *i, unsigned addr_mask, unsigned k; for (k = 0; k < i->nr_segs; k++, skip = 0) { - size_t len = i->iov[k].iov_len - skip; + const struct iovec *iov = iter_iov(i) + k; + size_t len = iov->iov_len - skip; if (len > size) len = size; if (len & len_mask) return false; - if ((unsigned long)(i->iov[k].iov_base + skip) & addr_mask) + if ((unsigned long)(iov->iov_base + skip) & addr_mask) return false; size -= len; @@ -1199,9 +1200,10 @@ static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i) unsigned k; for (k = 0; k < i->nr_segs; k++, skip = 0) { - size_t len = i->iov[k].iov_len - skip; + const struct iovec *iov = iter_iov(i) + k; + size_t len = iov->iov_len - skip; if (len) { - res |= (unsigned long)i->iov[k].iov_base + skip; + res |= (unsigned long)iov->iov_base + skip; if (len > size) len = size; res |= len; @@ -1278,14 +1280,15 @@ unsigned long iov_iter_gap_alignment(const struct iov_iter *i) return ~0U; for (k = 0; k < i->nr_segs; k++) { - if (i->iov[k].iov_len) { - unsigned long base = (unsigned long)i->iov[k].iov_base; + const struct iovec *iov = iter_iov(i) + k; + if (iov->iov_len) { + unsigned long base = (unsigned long)iov->iov_base; if (v) // if not the first one res |= base | v; // this start | previous end - v = base + i->iov[k].iov_len; - if (size <= i->iov[k].iov_len) + v = base + iov->iov_len; + if (size <= iov->iov_len) break; - size -= i->iov[k].iov_len; + size -= iov->iov_len; } } return res; @@ -1401,13 +1404,14 @@ static unsigned long first_iovec_segment(const struct iov_iter *i, size_t *size) return (unsigned long)i->ubuf + i->iov_offset; for (k = 0, skip = i->iov_offset; k < i->nr_segs; k++, skip = 0) { - size_t len = i->iov[k].iov_len - skip; + const struct iovec *iov = iter_iov(i) + k; + size_t len = iov->iov_len - skip; if (unlikely(!len)) continue; if (*size > len) *size = len; - return (unsigned long)i->iov[k].iov_base + skip; + return (unsigned long)iov->iov_base + 
skip; } BUG(); // if it had been empty, we wouldn't get called } @@ -1596,7 +1600,7 @@ static int iov_npages(const struct iov_iter *i, int maxpages) const struct iovec *p; int npages = 0; - for (p = i->iov; size; skip = 0, p++) { + for (p = iter_iov(i); size; skip = 0, p++) { unsigned offs = offset_in_page(p->iov_base + skip); size_t len = min(p->iov_len - skip, size); @@ -1673,7 +1677,7 @@ const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags) flags); else if (iov_iter_is_kvec(new) || iter_is_iovec(new)) /* iovec and kvec have identical layout */ - return new->iov = kmemdup(new->iov, + return new->__iov = kmemdup(new->__iov, new->nr_segs * sizeof(struct iovec), flags); return NULL; @@ -1855,6 +1859,17 @@ int import_single_range(int rw, void __user *buf, size_t len, } EXPORT_SYMBOL(import_single_range); +int import_ubuf(int rw, void __user *buf, size_t len, struct iov_iter *i) +{ + if (len > MAX_RW_COUNT) + len = MAX_RW_COUNT; + if (unlikely(!access_ok(buf, len))) + return -EFAULT; + + iov_iter_ubuf(i, rw, buf, len); + return 0; +} + /** * iov_iter_restore() - Restore a &struct iov_iter to the same state as when * iov_iter_save_state() was called. @@ -1869,8 +1884,8 @@ EXPORT_SYMBOL(import_single_range); */ void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state) { - if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i)) && - !iov_iter_is_kvec(i) && !iter_is_ubuf(i)) + if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i) && + !iter_is_ubuf(i)) && !iov_iter_is_kvec(i)) return; i->iov_offset = state->iov_offset; i->count = state->count; @@ -1889,6 +1904,6 @@ void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state) if (iov_iter_is_bvec(i)) i->bvec -= state->nr_segs - i->nr_segs; else - i->iov -= state->nr_segs - i->nr_segs; + i->__iov -= state->nr_segs - i->nr_segs; i->nr_segs = state->nr_segs; } diff --git a/mm/madvise.c b/mm/madvise.c index 900371ccf69d..64b4c40a788a 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1478,7 +1478,7 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, size_t, vlen, int, behavior, unsigned int, flags) { ssize_t ret; - struct iovec iovstack[UIO_FASTIOV], iovec; + struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; struct task_struct *task; @@ -1525,12 +1525,11 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, total_len = iov_iter_count(&iter); while (iov_iter_count(&iter)) { - iovec = iov_iter_iovec(&iter); - ret = do_madvise(mm, (unsigned long)iovec.iov_base, - iovec.iov_len, behavior); + ret = do_madvise(mm, (unsigned long)iter_iov_addr(&iter), + iter_iov_len(&iter), behavior); if (ret < 0) break; - iov_iter_advance(&iter, iovec.iov_len); + iov_iter_advance(&iter, iter_iov_len(&iter)); } ret = (total_len - iov_iter_count(&iter)) ? 
: ret; diff --git a/mm/nommu.c b/mm/nommu.c index e819cbc21b39..15652e25d958 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -975,9 +975,10 @@ static int do_mmap_private(struct vm_area_struct *vma, */ if (capabilities & NOMMU_MAP_DIRECT) { ret = call_mmap(vma->vm_file, vma); + /* shouldn't return success if we're not sharing */ + if (WARN_ON_ONCE(!is_nommu_shared_mapping(vma->vm_flags))) + ret = -ENOSYS; if (ret == 0) { - /* shouldn't return success if we're not sharing */ - BUG_ON(!(vma->vm_flags & VM_MAYSHARE)); vma->vm_region->vm_top = vma->vm_region->vm_end; return 0; } @@ -1118,7 +1119,7 @@ unsigned long do_mmap(struct file *file, * these cases, sharing is handled in the driver or filesystem rather * than here */ - if (vm_flags & VM_MAYSHARE) { + if (is_nommu_shared_mapping(vm_flags)) { struct vm_region *pregion; unsigned long pglen, rpglen, pgend, rpgend, start; @@ -1128,7 +1129,7 @@ unsigned long do_mmap(struct file *file, for (rb = rb_first(&nommu_region_tree); rb; rb = rb_next(rb)) { pregion = rb_entry(rb, struct vm_region, vm_rb); - if (!(pregion->vm_flags & VM_MAYSHARE)) + if (!is_nommu_shared_mapping(pregion->vm_flags)) continue; /* search for overlapping mappings on the same file */ @@ -1575,7 +1576,7 @@ static unsigned long do_mremap(unsigned long addr, if (vma->vm_end != vma->vm_start + old_len) return (unsigned long) -EFAULT; - if (vma->vm_flags & VM_MAYSHARE) + if (is_nommu_shared_mapping(vma->vm_flags)) return (unsigned long) -EPERM; if (new_len > vma->vm_region->vm_end - vma->vm_region->vm_start) diff --git a/net/core/sock.c b/net/core/sock.c index 49a5f353216a..73f58ec58487 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -114,6 +114,9 @@ #include #include #include +#include +#include +#include #include @@ -138,6 +141,7 @@ #include #include +#include #include @@ -4028,3 +4032,63 @@ int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len) return sk->sk_prot->bind_add(sk, addr, addr_len); } EXPORT_SYMBOL(sock_bind_add); + +/* Copy 'size' bytes from userspace and return `size` back to userspace */ +int sock_ioctl_inout(struct sock *sk, unsigned int cmd, + void __user *arg, void *karg, size_t size) +{ + int ret; + + if (copy_from_user(karg, arg, size)) + return -EFAULT; + + ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg); + if (ret) + return ret; + + if (copy_to_user(arg, karg, size)) + return -EFAULT; + + return 0; +} +EXPORT_SYMBOL(sock_ioctl_inout); + +/* This is the most common ioctl prep function, where the result (4 bytes) is + * copied back to userspace if the ioctl() returns successfully. No input is + * copied from userspace as input argument. + */ +static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg) +{ + int ret, karg = 0; + + ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg); + if (ret) + return ret; + + return put_user(karg, (int __user *)arg); +} + +/* A wrapper around sock ioctls, which copies the data from userspace + * (depending on the protocol/ioctl), and copies back the result to userspace. + * The main motivation for this function is to pass kernel memory to the + * protocol ioctl callbacks, instead of userspace memory. 
+ */ +int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) +{ + int rc = 1; + + if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET) + rc = ipmr_sk_ioctl(sk, cmd, arg); + else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6) + rc = ip6mr_sk_ioctl(sk, cmd, arg); + else if (sk_is_phonet(sk)) + rc = phonet_sk_ioctl(sk, cmd, arg); + + /* If ioctl was processed, returns its value */ + if (rc <= 0) + return rc; + + /* Otherwise call the default handler */ + return sock_ioctl_out(sk, cmd, arg); +} +EXPORT_SYMBOL(sk_ioctl); diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 0218eb169891..220c6d8997f2 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -297,7 +297,7 @@ int dccp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); int dccp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); -int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg); +int dccp_ioctl(struct sock *sk, int cmd, int *karg); int dccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); int dccp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 2af5a2828911..f35d169deb90 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -371,7 +371,7 @@ __poll_t dccp_poll(struct file *file, struct socket *sock, EXPORT_SYMBOL_GPL(dccp_poll); -int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg) +int dccp_ioctl(struct sock *sk, int cmd, int *karg) { int rc = -ENOTCONN; @@ -382,17 +382,17 @@ int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg) switch (cmd) { case SIOCOUTQ: { - int amount = sk_wmem_alloc_get(sk); + *karg = sk_wmem_alloc_get(sk); /* Using sk_wmem_alloc here because sk_wmem_queued is not used by DCCP and * always 0, comparably to UDP. */ - rc = put_user(amount, (int __user *)arg); + rc = 0; } break; case SIOCINQ: { struct sk_buff *skb; - unsigned long amount = 0; + *karg = 0; skb = skb_peek(&sk->sk_receive_queue); if (skb != NULL) { @@ -400,9 +400,9 @@ int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg) * We will only return the amount of this packet since * that is all that will be read. */ - amount = skb->len; + *karg = skb->len; } - rc = put_user(amount, (int __user *)arg); + rc = 0; } break; default: diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index a725dd9bbda8..3ec8597ab162 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -162,7 +162,7 @@ static int ieee802154_sock_ioctl(struct socket *sock, unsigned int cmd, default: if (!sk->sk_prot->ioctl) return -ENOIOCTLCMD; - return sk->sk_prot->ioctl(sk, cmd, arg); + return sk_ioctl(sk, cmd, (void __user *)arg); } } @@ -524,22 +524,21 @@ out: return err; } -static int dgram_ioctl(struct sock *sk, int cmd, unsigned long arg) +static int dgram_ioctl(struct sock *sk, int cmd, int *karg) { switch (cmd) { case SIOCOUTQ: { - int amount = sk_wmem_alloc_get(sk); + *karg = sk_wmem_alloc_get(sk); - return put_user(amount, (int __user *)arg); + return 0; } case SIOCINQ: { struct sk_buff *skb; - unsigned long amount; - amount = 0; + *karg = 0; spin_lock_bh(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); if (skb) { @@ -547,10 +546,10 @@ static int dgram_ioctl(struct sock *sk, int cmd, unsigned long arg) * of this packet since that is all * that will be read. 
*/ - amount = skb->len - ieee802154_hdr_length(skb); + *karg = skb->len - ieee802154_hdr_length(skb); } spin_unlock_bh(&sk->sk_receive_queue.lock); - return put_user(amount, (int __user *)arg); + return 0; } } diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index c64caabfb02a..9ec75571c568 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1004,7 +1004,7 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) break; default: if (sk->sk_prot->ioctl) - err = sk->sk_prot->ioctl(sk, cmd, arg); + err = sk_ioctl(sk, cmd, (void __user *)arg); else err = -ENOIOCTLCMD; break; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index ad81f5f65a1e..579b60a3c8d7 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1540,6 +1540,28 @@ out: return ret; } +/* Execute if this ioctl is a special mroute ioctl */ +int ipmr_sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) +{ + switch (cmd) { + /* These userspace buffers will be consumed by ipmr_ioctl() */ + case SIOCGETVIFCNT: { + struct sioc_vif_req buffer; + + return sock_ioctl_inout(sk, cmd, arg, &buffer, + sizeof(buffer)); + } + case SIOCGETSGCNT: { + struct sioc_sg_req buffer; + + return sock_ioctl_inout(sk, cmd, arg, &buffer, + sizeof(buffer)); + } + } + /* return code > 0 means that the ioctl was not executed */ + return 1; +} + /* Getsock opt support for the multicast routing system. */ int ip_mroute_getsockopt(struct sock *sk, int optname, sockptr_t optval, sockptr_t optlen) @@ -1586,13 +1608,13 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, sockptr_t optval, } /* The IP multicast ioctl support routines. */ -int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg) +int ipmr_ioctl(struct sock *sk, int cmd, void *arg) { - struct sioc_sg_req sr; - struct sioc_vif_req vr; struct vif_device *vif; struct mfc_cache *c; struct net *net = sock_net(sk); + struct sioc_vif_req *vr; + struct sioc_sg_req *sr; struct mr_table *mrt; mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? 
: RT_TABLE_DEFAULT); @@ -1601,40 +1623,33 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg) switch (cmd) { case SIOCGETVIFCNT: - if (copy_from_user(&vr, arg, sizeof(vr))) - return -EFAULT; - if (vr.vifi >= mrt->maxvif) + vr = (struct sioc_vif_req *)arg; + if (vr->vifi >= mrt->maxvif) return -EINVAL; - vr.vifi = array_index_nospec(vr.vifi, mrt->maxvif); + vr->vifi = array_index_nospec(vr->vifi, mrt->maxvif); read_lock(&mrt_lock); - vif = &mrt->vif_table[vr.vifi]; - if (VIF_EXISTS(mrt, vr.vifi)) { - vr.icount = vif->pkt_in; - vr.ocount = vif->pkt_out; - vr.ibytes = vif->bytes_in; - vr.obytes = vif->bytes_out; + vif = &mrt->vif_table[vr->vifi]; + if (VIF_EXISTS(mrt, vr->vifi)) { + vr->icount = vif->pkt_in; + vr->ocount = vif->pkt_out; + vr->ibytes = vif->bytes_in; + vr->obytes = vif->bytes_out; read_unlock(&mrt_lock); - if (copy_to_user(arg, &vr, sizeof(vr))) - return -EFAULT; return 0; } read_unlock(&mrt_lock); return -EADDRNOTAVAIL; case SIOCGETSGCNT: - if (copy_from_user(&sr, arg, sizeof(sr))) - return -EFAULT; + sr = (struct sioc_sg_req *)arg; rcu_read_lock(); - c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr); + c = ipmr_cache_find(mrt, sr->src.s_addr, sr->grp.s_addr); if (c) { - sr.pktcnt = c->_c.mfc_un.res.pkt; - sr.bytecnt = c->_c.mfc_un.res.bytes; - sr.wrong_if = c->_c.mfc_un.res.wrong_if; + sr->pktcnt = c->_c.mfc_un.res.pkt; + sr->bytecnt = c->_c.mfc_un.res.bytes; + sr->wrong_if = c->_c.mfc_un.res.wrong_if; rcu_read_unlock(); - - if (copy_to_user(arg, &sr, sizeof(sr))) - return -EFAULT; return 0; } rcu_read_unlock(); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 31b1f9a1f482..e2d5e21a06fa 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -855,29 +855,29 @@ static int raw_getsockopt(struct sock *sk, int level, int optname, return do_raw_getsockopt(sk, level, optname, optval, optlen); } -static int raw_ioctl(struct sock *sk, int cmd, unsigned long arg) +static int raw_ioctl(struct sock *sk, int cmd, int *karg) { switch (cmd) { case SIOCOUTQ: { - int amount = sk_wmem_alloc_get(sk); - - return put_user(amount, (int __user *)arg); + *karg = sk_wmem_alloc_get(sk); + return 0; } case SIOCINQ: { struct sk_buff *skb; - int amount = 0; spin_lock_bh(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); if (skb) - amount = skb->len; + *karg = skb->len; + else + *karg = 0; spin_unlock_bh(&sk->sk_receive_queue.lock); - return put_user(amount, (int __user *)arg); + return 0; } default: #ifdef CONFIG_IP_MROUTE - return ipmr_ioctl(sk, cmd, (void __user *)arg); + return ipmr_ioctl(sk, cmd, karg); #else return -ENOIOCTLCMD; #endif diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5274e46e7d56..5d3202a73261 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -596,7 +596,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) } EXPORT_SYMBOL(tcp_poll); -int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) +int tcp_ioctl(struct sock *sk, int cmd, int *karg) { struct tcp_sock *tp = tcp_sk(sk); int answ; @@ -638,7 +638,8 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) return -ENOIOCTLCMD; } - return put_user(answ, (int __user *)arg); + *karg = answ; + return 0; } EXPORT_SYMBOL(tcp_ioctl); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 4420e5bc399a..bbf80fbfa793 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1717,21 +1717,19 @@ static int first_packet_length(struct sock *sk) * IOCTL requests applicable to the UDP protocol */ -int udp_ioctl(struct sock *sk, int cmd, unsigned long arg) +int udp_ioctl(struct sock 
*sk, int cmd, int *karg) { switch (cmd) { case SIOCOUTQ: { - int amount = sk_wmem_alloc_get(sk); - - return put_user(amount, (int __user *)arg); + *karg = sk_wmem_alloc_get(sk); + return 0; } case SIOCINQ: { - int amount = max_t(int, 0, first_packet_length(sk)); - - return put_user(amount, (int __user *)arg); + *karg = max_t(int, 0, first_packet_length(sk)); + return 0; } default: diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 3fe43345a99d..ae411ec9e1f3 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -594,7 +594,7 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) prot = READ_ONCE(sk->sk_prot); if (!prot->ioctl) return -ENOIOCTLCMD; - return prot->ioctl(sk, cmd, arg); + return sk_ioctl(sk, cmd, (void __user *)arg); } /*NOTREACHED*/ return 0; diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index a19c00565b85..59edd545a7ff 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1853,11 +1853,10 @@ int ip6_mroute_getsockopt(struct sock *sk, int optname, sockptr_t optval, /* * The IP multicast ioctl support routines. */ - -int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg) +int ip6mr_ioctl(struct sock *sk, int cmd, void *arg) { - struct sioc_sg_req6 sr; - struct sioc_mif_req6 vr; + struct sioc_sg_req6 *sr; + struct sioc_mif_req6 *vr; struct vif_device *vif; struct mfc6_cache *c; struct net *net = sock_net(sk); @@ -1869,40 +1868,33 @@ int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg) switch (cmd) { case SIOCGETMIFCNT_IN6: - if (copy_from_user(&vr, arg, sizeof(vr))) - return -EFAULT; - if (vr.mifi >= mrt->maxvif) + vr = (struct sioc_mif_req6 *)arg; + if (vr->mifi >= mrt->maxvif) return -EINVAL; - vr.mifi = array_index_nospec(vr.mifi, mrt->maxvif); + vr->mifi = array_index_nospec(vr->mifi, mrt->maxvif); read_lock(&mrt_lock); - vif = &mrt->vif_table[vr.mifi]; - if (VIF_EXISTS(mrt, vr.mifi)) { - vr.icount = vif->pkt_in; - vr.ocount = vif->pkt_out; - vr.ibytes = vif->bytes_in; - vr.obytes = vif->bytes_out; + vif = &mrt->vif_table[vr->mifi]; + if (VIF_EXISTS(mrt, vr->mifi)) { + vr->icount = vif->pkt_in; + vr->ocount = vif->pkt_out; + vr->ibytes = vif->bytes_in; + vr->obytes = vif->bytes_out; read_unlock(&mrt_lock); - - if (copy_to_user(arg, &vr, sizeof(vr))) - return -EFAULT; return 0; } read_unlock(&mrt_lock); return -EADDRNOTAVAIL; case SIOCGETSGCNT_IN6: - if (copy_from_user(&sr, arg, sizeof(sr))) - return -EFAULT; + sr = (struct sioc_sg_req6 *)arg; rcu_read_lock(); - c = ip6mr_cache_find(mrt, &sr.src.sin6_addr, &sr.grp.sin6_addr); + c = ip6mr_cache_find(mrt, &sr->src.sin6_addr, + &sr->grp.sin6_addr); if (c) { - sr.pktcnt = c->_c.mfc_un.res.pkt; - sr.bytecnt = c->_c.mfc_un.res.bytes; - sr.wrong_if = c->_c.mfc_un.res.wrong_if; + sr->pktcnt = c->_c.mfc_un.res.pkt; + sr->bytecnt = c->_c.mfc_un.res.bytes; + sr->wrong_if = c->_c.mfc_un.res.wrong_if; rcu_read_unlock(); - - if (copy_to_user(arg, &sr, sizeof(sr))) - return -EFAULT; return 0; } rcu_read_unlock(); diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 7b48a477444b..0e4c38c32b34 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -1116,29 +1116,29 @@ static int rawv6_getsockopt(struct sock *sk, int level, int optname, return do_rawv6_getsockopt(sk, level, optname, optval, optlen); } -static int rawv6_ioctl(struct sock *sk, int cmd, unsigned long arg) +static int rawv6_ioctl(struct sock *sk, int cmd, int *karg) { switch (cmd) { case SIOCOUTQ: { - int amount = sk_wmem_alloc_get(sk); - - return put_user(amount, (int __user *)arg); + *karg = sk_wmem_alloc_get(sk); + return 0; 
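These conversions are all instances of the series' central ioctl rework: sk_prot->ioctl() now takes a kernel pointer (int *karg) instead of a raw user address, and sk_ioctl()/sock_ioctl_out() in net/core/sock.c perform the single copy from/to userspace on every handler's behalf. A sketch of the resulting handler shape, distilled from the udp_ioctl() and rawv6_ioctl() conversions shown here; foo_ioctl() is a hypothetical handler, not code from this series:

	/* Under the new convention the core (sk_ioctl() -> sock_ioctl_out())
	 * owns all user copies, so the handler only computes the
	 * kernel-side result and reports success or an error. */
	static int foo_ioctl(struct sock *sk, int cmd, int *karg)
	{
		struct sk_buff *skb;

		switch (cmd) {
		case SIOCOUTQ:		/* bytes queued but not yet sent */
			*karg = sk_wmem_alloc_get(sk);
			return 0;
		case SIOCINQ:		/* bytes in the next queued packet */
			spin_lock_bh(&sk->sk_receive_queue.lock);
			skb = skb_peek(&sk->sk_receive_queue);
			*karg = skb ? skb->len : 0;
			spin_unlock_bh(&sk->sk_receive_queue.lock);
			return 0;
		default:
			return -ENOIOCTLCMD;	/* let the caller fall back */
		}
	}

Keeping the user copies in one place is also what allows io_uring_cmd_sock() to invoke the very same handlers with on-stack kernel memory.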
} case SIOCINQ: { struct sk_buff *skb; - int amount = 0; spin_lock_bh(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); if (skb) - amount = skb->len; + *karg = skb->len; + else + *karg = 0; spin_unlock_bh(&sk->sk_receive_queue.lock); - return put_user(amount, (int __user *)arg); + return 0; } default: #ifdef CONFIG_IPV6_MROUTE - return ip6mr_ioctl(sk, cmd, (void __user *)arg); + return ip6mr_ioctl(sk, cmd, karg); #else return -ENOIOCTLCMD; #endif diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index 98ea98eb9567..3a3c2fa308a7 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -272,7 +272,7 @@ int l2tp_nl_register_ops(enum l2tp_pwtype pw_type, const struct l2tp_nl_cmd_ops void l2tp_nl_unregister_ops(enum l2tp_pwtype pw_type); /* IOCTL helper for IP encap modules. */ -int l2tp_ioctl(struct sock *sk, int cmd, unsigned long arg); +int l2tp_ioctl(struct sock *sk, int cmd, int *karg); /* Extract the tunnel structure from a socket's sk_user_data pointer, * validating the tunnel magic feather. diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 8df7fe3a0613..f5dd1159d3d6 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -563,19 +563,18 @@ out: return err ? err : copied; } -int l2tp_ioctl(struct sock *sk, int cmd, unsigned long arg) +int l2tp_ioctl(struct sock *sk, int cmd, int *karg) { struct sk_buff *skb; - int amount; switch (cmd) { case SIOCOUTQ: - amount = sk_wmem_alloc_get(sk); + *karg = sk_wmem_alloc_get(sk); break; case SIOCINQ: spin_lock_bh(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); - amount = skb ? skb->len : 0; + *karg = skb ? skb->len : 0; spin_unlock_bh(&sk->sk_receive_queue.lock); break; @@ -583,7 +582,7 @@ int l2tp_ioctl(struct sock *sk, int cmd, unsigned long arg) return -ENOIOCTLCMD; } - return put_user(amount, (int __user *)arg); + return 0; } EXPORT_SYMBOL_GPL(l2tp_ioctl); diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index fde25786619d..2f108728f4f2 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3620,11 +3620,10 @@ static int mptcp_ioctl_outq(const struct mptcp_sock *msk, u64 v) return (int)delta; } -static int mptcp_ioctl(struct sock *sk, int cmd, unsigned long arg) +static int mptcp_ioctl(struct sock *sk, int cmd, int *karg) { struct mptcp_sock *msk = mptcp_sk(sk); bool slow; - int answ; switch (cmd) { case SIOCINQ: @@ -3633,24 +3632,24 @@ static int mptcp_ioctl(struct sock *sk, int cmd, unsigned long arg) lock_sock(sk); __mptcp_move_skbs(msk); - answ = mptcp_inq_hint(sk); + *karg = mptcp_inq_hint(sk); release_sock(sk); break; case SIOCOUTQ: slow = lock_sock_fast(sk); - answ = mptcp_ioctl_outq(msk, READ_ONCE(msk->snd_una)); + *karg = mptcp_ioctl_outq(msk, READ_ONCE(msk->snd_una)); unlock_sock_fast(sk, slow); break; case SIOCOUTQNSD: slow = lock_sock_fast(sk); - answ = mptcp_ioctl_outq(msk, msk->snd_nxt); + *karg = mptcp_ioctl_outq(msk, msk->snd_nxt); unlock_sock_fast(sk, slow); break; default: return -ENOIOCTLCMD; } - return put_user(answ, (int __user *)arg); + return 0; } static void mptcp_subflow_early_fallback(struct mptcp_sock *msk, diff --git a/net/phonet/datagram.c b/net/phonet/datagram.c index 3f2e62b63dd4..776012e80606 100644 --- a/net/phonet/datagram.c +++ b/net/phonet/datagram.c @@ -28,24 +28,21 @@ static void pn_sock_close(struct sock *sk, long timeout) sk_common_release(sk); } -static int pn_ioctl(struct sock *sk, int cmd, unsigned long arg) +static int pn_ioctl(struct sock *sk, int cmd, int *karg) { struct sk_buff *skb; - int answ; switch (cmd) { case 
SIOCINQ: lock_sock(sk); skb = skb_peek(&sk->sk_receive_queue); - answ = skb ? skb->len : 0; + *karg = skb ? skb->len : 0; release_sock(sk); - return put_user(answ, (int __user *)arg); + return 0; case SIOCPNADDRESOURCE: case SIOCPNDELRESOURCE: { - u32 res; - if (get_user(res, (u32 __user *)arg)) - return -EFAULT; + u32 res = *karg; if (res >= 256) return -EINVAL; if (cmd == SIOCPNADDRESOURCE) diff --git a/net/phonet/pep.c b/net/phonet/pep.c index 8d73635b72b6..e86eb0e671f4 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -916,10 +916,9 @@ static int pep_sock_enable(struct sock *sk, struct sockaddr *addr, int len) return 0; } -static int pep_ioctl(struct sock *sk, int cmd, unsigned long arg) +static int pep_ioctl(struct sock *sk, int cmd, int *karg) { struct pep_sock *pn = pep_sk(sk); - int answ; int ret = -ENOIOCTLCMD; switch (cmd) { @@ -932,13 +931,13 @@ static int pep_ioctl(struct sock *sk, int cmd, unsigned long arg) lock_sock(sk); if (sock_flag(sk, SOCK_URGINLINE) && !skb_queue_empty(&pn->ctrlreq_queue)) - answ = skb_peek(&pn->ctrlreq_queue)->len; + *karg = skb_peek(&pn->ctrlreq_queue)->len; else if (!skb_queue_empty(&sk->sk_receive_queue)) - answ = skb_peek(&sk->sk_receive_queue)->len; + *karg = skb_peek(&sk->sk_receive_queue)->len; else - answ = 0; + *karg = 0; release_sock(sk); - ret = put_user(answ, (int __user *)arg); + ret = 0; break; case SIOCPNENABLEPIPE: diff --git a/net/phonet/socket.c b/net/phonet/socket.c index 71e2caf6ab85..967f9b4dc026 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -387,7 +387,7 @@ static int pn_socket_ioctl(struct socket *sock, unsigned int cmd, return put_user(handle, (__u16 __user *)arg); } - return sk->sk_prot->ioctl(sk, cmd, arg); + return sk_ioctl(sk, cmd, (void __user *)arg); } static int pn_socket_listen(struct socket *sock, int backlog) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 07af385900de..0a817feef62f 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -4896,7 +4896,7 @@ out: } /* The SCTP ioctl handler. */ -static int sctp_ioctl(struct sock *sk, int cmd, unsigned long arg) +static int sctp_ioctl(struct sock *sk, int cmd, int *karg) { int rc = -ENOTCONN; @@ -4912,7 +4912,7 @@ static int sctp_ioctl(struct sock *sk, int cmd, unsigned long arg) switch (cmd) { case SIOCINQ: { struct sk_buff *skb; - unsigned int amount = 0; + *karg = 0; skb = skb_peek(&sk->sk_receive_queue); if (skb != NULL) { @@ -4920,9 +4920,9 @@ static int sctp_ioctl(struct sock *sk, int cmd, unsigned long arg) * We will only return the amount of this packet since * that is all that will be read. 
*/ - amount = skb->len; + *karg = skb->len; } - rc = put_user(amount, (int __user *)arg); + rc = 0; break; } default: diff --git a/net/socket.c b/net/socket.c index f81f5e03b0e5..edd3504e76a1 100644 --- a/net/socket.c +++ b/net/socket.c @@ -87,6 +87,7 @@ #include #include #include +#include #include #include @@ -158,6 +159,7 @@ static const struct file_operations socket_file_ops = { #ifdef CONFIG_COMPAT .compat_ioctl = compat_sock_ioctl, #endif + .uring_cmd = io_uring_cmd_sock, .mmap = sock_mmap, .release = sock_close, .fasync = sock_fasync, @@ -469,6 +471,7 @@ struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname) return file; } + file->f_mode |= FMODE_NOWAIT; sock->file = file; file->private_data = sock; stream_open(SOCK_INODE(sock), file); @@ -2104,6 +2107,7 @@ int __sys_sendto(int fd, void __user *buff, size_t len, unsigned int flags, msg.msg_name = (struct sockaddr *)&address; msg.msg_namelen = addr_len; } + flags &= ~MSG_INTERNAL_SENDMSG_FLAGS; if (sock->file->f_flags & O_NONBLOCK) flags |= MSG_DONTWAIT; msg.msg_flags = flags; @@ -2449,6 +2453,7 @@ static int ____sys_sendmsg(struct socket *sock, struct msghdr *msg_sys, msg_sys->msg_control = ctl_buf; msg_sys->msg_control_is_user = false; } + flags &= ~MSG_INTERNAL_SENDMSG_FLAGS; msg_sys->msg_flags = flags; if (sock->file->f_flags & O_NONBLOCK) diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index 6a7aa603cc21..b48bce15d2b8 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -3521,6 +3521,7 @@ static ssize_t snd_pcm_readv(struct kiocb *iocb, struct iov_iter *to) unsigned long i; void __user **bufs; snd_pcm_uframes_t frames; + const struct iovec *iov = iter_iov(to); pcm_file = iocb->ki_filp->private_data; substream = pcm_file->substream; @@ -3530,18 +3531,20 @@ static ssize_t snd_pcm_readv(struct kiocb *iocb, struct iov_iter *to) if (runtime->state == SNDRV_PCM_STATE_OPEN || runtime->state == SNDRV_PCM_STATE_DISCONNECTED) return -EBADFD; - if (!iter_is_iovec(to)) + if (!to->user_backed) return -EINVAL; if (to->nr_segs > 1024 || to->nr_segs != runtime->channels) return -EINVAL; - if (!frame_aligned(runtime, to->iov->iov_len)) + if (!frame_aligned(runtime, iov->iov_len)) return -EINVAL; - frames = bytes_to_samples(runtime, to->iov->iov_len); + frames = bytes_to_samples(runtime, iov->iov_len); bufs = kmalloc_array(to->nr_segs, sizeof(void *), GFP_KERNEL); if (bufs == NULL) return -ENOMEM; - for (i = 0; i < to->nr_segs; ++i) - bufs[i] = to->iov[i].iov_base; + for (i = 0; i < to->nr_segs; ++i) { + bufs[i] = iov->iov_base; + iov++; + } result = snd_pcm_lib_readv(substream, bufs, frames); if (result > 0) result = frames_to_bytes(runtime, result); @@ -3558,6 +3561,7 @@ static ssize_t snd_pcm_writev(struct kiocb *iocb, struct iov_iter *from) unsigned long i; void __user **bufs; snd_pcm_uframes_t frames; + const struct iovec *iov = iter_iov(from); pcm_file = iocb->ki_filp->private_data; substream = pcm_file->substream; @@ -3567,17 +3571,19 @@ static ssize_t snd_pcm_writev(struct kiocb *iocb, struct iov_iter *from) if (runtime->state == SNDRV_PCM_STATE_OPEN || runtime->state == SNDRV_PCM_STATE_DISCONNECTED) return -EBADFD; - if (!iter_is_iovec(from)) + if (!from->user_backed) return -EINVAL; if (from->nr_segs > 128 || from->nr_segs != runtime->channels || - !frame_aligned(runtime, from->iov->iov_len)) + !frame_aligned(runtime, iov->iov_len)) return -EINVAL; - frames = bytes_to_samples(runtime, from->iov->iov_len); + frames = bytes_to_samples(runtime, iov->iov_len); bufs = 
kmalloc_array(from->nr_segs, sizeof(void *), GFP_KERNEL); if (bufs == NULL) return -ENOMEM; - for (i = 0; i < from->nr_segs; ++i) - bufs[i] = from->iov[i].iov_base; + for (i = 0; i < from->nr_segs; ++i) { + bufs[i] = iov->iov_base; + iov++; + } result = snd_pcm_lib_writev(substream, bufs, frames); if (result > 0) result = frames_to_bytes(runtime, result); diff --git a/tools/io_uring/Makefile b/tools/io_uring/Makefile deleted file mode 100644 index 00f146c54c53..000000000000 --- a/tools/io_uring/Makefile +++ /dev/null @@ -1,18 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# Makefile for io_uring test tools -CFLAGS += -Wall -Wextra -g -D_GNU_SOURCE -LDLIBS += -lpthread - -all: io_uring-cp io_uring-bench -%: %.c - $(CC) $(CFLAGS) -o $@ $^ - -io_uring-bench: syscall.o io_uring-bench.o - $(CC) $(CFLAGS) -o $@ $^ $(LDLIBS) - -io_uring-cp: setup.o syscall.o queue.o - -clean: - $(RM) io_uring-cp io_uring-bench *.o - -.PHONY: all clean diff --git a/tools/io_uring/README b/tools/io_uring/README deleted file mode 100644 index 67fd70115cff..000000000000 --- a/tools/io_uring/README +++ /dev/null @@ -1,29 +0,0 @@ -This directory includes a few programs that demonstrate how to use io_uring -in an application. The examples are: - -io_uring-cp - A very basic io_uring implementation of cp(1). It takes two - arguments, copies the first argument to the second. This example - is part of liburing, and hence uses the simplified liburing API - for setting up an io_uring instance, submitting IO, completing IO, - etc. The support functions in queue.c and setup.c are straight - out of liburing. - -io_uring-bench - Benchmark program that does random reads on a number of files. This - app demonstrates the various features of io_uring, like fixed files, - fixed buffers, and polled IO. There are options in the program to - control which features to use. Arguments is the file (or files) that - io_uring-bench should operate on. This uses the raw io_uring - interface. - -liburing can be cloned with git here: - - git://git.kernel.dk/liburing - -and contains a number of unit tests as well for testing io_uring. It also -comes with man pages for the three system calls. - -Fio includes an io_uring engine, you can clone fio here: - - git://git.kernel.dk/fio diff --git a/tools/io_uring/barrier.h b/tools/io_uring/barrier.h deleted file mode 100644 index ef00f6722ba9..000000000000 --- a/tools/io_uring/barrier.h +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef LIBURING_BARRIER_H -#define LIBURING_BARRIER_H - -#if defined(__x86_64) || defined(__i386__) -#define read_barrier() __asm__ __volatile__("":::"memory") -#define write_barrier() __asm__ __volatile__("":::"memory") -#else -/* - * Add arch appropriate definitions. Be safe and use full barriers for - * archs we don't have support for. - */ -#define read_barrier() __sync_synchronize() -#define write_barrier() __sync_synchronize() -#endif - -#endif diff --git a/tools/io_uring/io_uring-bench.c b/tools/io_uring/io_uring-bench.c deleted file mode 100644 index 7703f0118385..000000000000 --- a/tools/io_uring/io_uring-bench.c +++ /dev/null @@ -1,592 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Simple benchmark program that uses the various features of io_uring - * to provide fast random access to a device/file. It has various - * options that are control how we use io_uring, see the OPTIONS section - * below. This uses the raw io_uring interface. 
- * - * Copyright (C) 2018-2019 Jens Axboe - */ -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "liburing.h" -#include "barrier.h" - -#define min(a, b) ((a < b) ? (a) : (b)) - -struct io_sq_ring { - unsigned *head; - unsigned *tail; - unsigned *ring_mask; - unsigned *ring_entries; - unsigned *flags; - unsigned *array; -}; - -struct io_cq_ring { - unsigned *head; - unsigned *tail; - unsigned *ring_mask; - unsigned *ring_entries; - struct io_uring_cqe *cqes; -}; - -#define DEPTH 128 - -#define BATCH_SUBMIT 32 -#define BATCH_COMPLETE 32 - -#define BS 4096 - -#define MAX_FDS 16 - -static unsigned sq_ring_mask, cq_ring_mask; - -struct file { - unsigned long max_blocks; - unsigned pending_ios; - int real_fd; - int fixed_fd; -}; - -struct submitter { - pthread_t thread; - int ring_fd; - struct drand48_data rand; - struct io_sq_ring sq_ring; - struct io_uring_sqe *sqes; - struct iovec iovecs[DEPTH]; - struct io_cq_ring cq_ring; - int inflight; - unsigned long reaps; - unsigned long done; - unsigned long calls; - volatile int finish; - - __s32 *fds; - - struct file files[MAX_FDS]; - unsigned nr_files; - unsigned cur_file; -}; - -static struct submitter submitters[1]; -static volatile int finish; - -/* - * OPTIONS: Set these to test the various features of io_uring. - */ -static int polled = 1; /* use IO polling */ -static int fixedbufs = 1; /* use fixed user buffers */ -static int register_files = 1; /* use fixed files */ -static int buffered = 0; /* use buffered IO, not O_DIRECT */ -static int sq_thread_poll = 0; /* use kernel submission/poller thread */ -static int sq_thread_cpu = -1; /* pin above thread to this CPU */ -static int do_nop = 0; /* no-op SQ ring commands */ - -static int io_uring_register_buffers(struct submitter *s) -{ - if (do_nop) - return 0; - - return io_uring_register(s->ring_fd, IORING_REGISTER_BUFFERS, s->iovecs, - DEPTH); -} - -static int io_uring_register_files(struct submitter *s) -{ - unsigned i; - - if (do_nop) - return 0; - - s->fds = calloc(s->nr_files, sizeof(__s32)); - for (i = 0; i < s->nr_files; i++) { - s->fds[i] = s->files[i].real_fd; - s->files[i].fixed_fd = i; - } - - return io_uring_register(s->ring_fd, IORING_REGISTER_FILES, s->fds, - s->nr_files); -} - -static int lk_gettid(void) -{ - return syscall(__NR_gettid); -} - -static unsigned file_depth(struct submitter *s) -{ - return (DEPTH + s->nr_files - 1) / s->nr_files; -} - -static void init_io(struct submitter *s, unsigned index) -{ - struct io_uring_sqe *sqe = &s->sqes[index]; - unsigned long offset; - struct file *f; - long r; - - if (do_nop) { - sqe->opcode = IORING_OP_NOP; - return; - } - - if (s->nr_files == 1) { - f = &s->files[0]; - } else { - f = &s->files[s->cur_file]; - if (f->pending_ios >= file_depth(s)) { - s->cur_file++; - if (s->cur_file == s->nr_files) - s->cur_file = 0; - f = &s->files[s->cur_file]; - } - } - f->pending_ios++; - - lrand48_r(&s->rand, &r); - offset = (r % (f->max_blocks - 1)) * BS; - - if (register_files) { - sqe->flags = IOSQE_FIXED_FILE; - sqe->fd = f->fixed_fd; - } else { - sqe->flags = 0; - sqe->fd = f->real_fd; - } - if (fixedbufs) { - sqe->opcode = IORING_OP_READ_FIXED; - sqe->addr = (unsigned long) s->iovecs[index].iov_base; - sqe->len = BS; - sqe->buf_index = index; - } else { - sqe->opcode = IORING_OP_READV; - sqe->addr = (unsigned long) &s->iovecs[index]; - sqe->len = 1; - sqe->buf_index = 0; - } - 
sqe->ioprio = 0; - sqe->off = offset; - sqe->user_data = (unsigned long) f; -} - -static int prep_more_ios(struct submitter *s, unsigned max_ios) -{ - struct io_sq_ring *ring = &s->sq_ring; - unsigned index, tail, next_tail, prepped = 0; - - next_tail = tail = *ring->tail; - do { - next_tail++; - read_barrier(); - if (next_tail == *ring->head) - break; - - index = tail & sq_ring_mask; - init_io(s, index); - ring->array[index] = index; - prepped++; - tail = next_tail; - } while (prepped < max_ios); - - if (*ring->tail != tail) { - /* order tail store with writes to sqes above */ - write_barrier(); - *ring->tail = tail; - write_barrier(); - } - return prepped; -} - -static int get_file_size(struct file *f) -{ - struct stat st; - - if (fstat(f->real_fd, &st) < 0) - return -1; - if (S_ISBLK(st.st_mode)) { - unsigned long long bytes; - - if (ioctl(f->real_fd, BLKGETSIZE64, &bytes) != 0) - return -1; - - f->max_blocks = bytes / BS; - return 0; - } else if (S_ISREG(st.st_mode)) { - f->max_blocks = st.st_size / BS; - return 0; - } - - return -1; -} - -static int reap_events(struct submitter *s) -{ - struct io_cq_ring *ring = &s->cq_ring; - struct io_uring_cqe *cqe; - unsigned head, reaped = 0; - - head = *ring->head; - do { - struct file *f; - - read_barrier(); - if (head == *ring->tail) - break; - cqe = &ring->cqes[head & cq_ring_mask]; - if (!do_nop) { - f = (struct file *) (uintptr_t) cqe->user_data; - f->pending_ios--; - if (cqe->res != BS) { - printf("io: unexpected ret=%d\n", cqe->res); - if (polled && cqe->res == -EOPNOTSUPP) - printf("Your filesystem doesn't support poll\n"); - return -1; - } - } - reaped++; - head++; - } while (1); - - s->inflight -= reaped; - *ring->head = head; - write_barrier(); - return reaped; -} - -static void *submitter_fn(void *data) -{ - struct submitter *s = data; - struct io_sq_ring *ring = &s->sq_ring; - int ret, prepped; - - printf("submitter=%d\n", lk_gettid()); - - srand48_r(pthread_self(), &s->rand); - - prepped = 0; - do { - int to_wait, to_submit, this_reap, to_prep; - - if (!prepped && s->inflight < DEPTH) { - to_prep = min(DEPTH - s->inflight, BATCH_SUBMIT); - prepped = prep_more_ios(s, to_prep); - } - s->inflight += prepped; -submit_more: - to_submit = prepped; -submit: - if (to_submit && (s->inflight + to_submit <= DEPTH)) - to_wait = 0; - else - to_wait = min(s->inflight + to_submit, BATCH_COMPLETE); - - /* - * Only need to call io_uring_enter if we're not using SQ thread - * poll, or if IORING_SQ_NEED_WAKEUP is set. - */ - if (!sq_thread_poll || (*ring->flags & IORING_SQ_NEED_WAKEUP)) { - unsigned flags = 0; - - if (to_wait) - flags = IORING_ENTER_GETEVENTS; - if ((*ring->flags & IORING_SQ_NEED_WAKEUP)) - flags |= IORING_ENTER_SQ_WAKEUP; - ret = io_uring_enter(s->ring_fd, to_submit, to_wait, - flags, NULL); - s->calls++; - } - - /* - * For non SQ thread poll, we already got the events we needed - * through the io_uring_enter() above. For SQ thread poll, we - * need to loop here until we find enough events. 
- */ - this_reap = 0; - do { - int r; - r = reap_events(s); - if (r == -1) { - s->finish = 1; - break; - } else if (r > 0) - this_reap += r; - } while (sq_thread_poll && this_reap < to_wait); - s->reaps += this_reap; - - if (ret >= 0) { - if (!ret) { - to_submit = 0; - if (s->inflight) - goto submit; - continue; - } else if (ret < to_submit) { - int diff = to_submit - ret; - - s->done += ret; - prepped -= diff; - goto submit_more; - } - s->done += ret; - prepped = 0; - continue; - } else if (ret < 0) { - if (errno == EAGAIN) { - if (s->finish) - break; - if (this_reap) - goto submit; - to_submit = 0; - goto submit; - } - printf("io_submit: %s\n", strerror(errno)); - break; - } - } while (!s->finish); - - finish = 1; - return NULL; -} - -static void sig_int(int sig) -{ - printf("Exiting on signal %d\n", sig); - submitters[0].finish = 1; - finish = 1; -} - -static void arm_sig_int(void) -{ - struct sigaction act; - - memset(&act, 0, sizeof(act)); - act.sa_handler = sig_int; - act.sa_flags = SA_RESTART; - sigaction(SIGINT, &act, NULL); -} - -static int setup_ring(struct submitter *s) -{ - struct io_sq_ring *sring = &s->sq_ring; - struct io_cq_ring *cring = &s->cq_ring; - struct io_uring_params p; - int ret, fd; - void *ptr; - - memset(&p, 0, sizeof(p)); - - if (polled && !do_nop) - p.flags |= IORING_SETUP_IOPOLL; - if (sq_thread_poll) { - p.flags |= IORING_SETUP_SQPOLL; - if (sq_thread_cpu != -1) { - p.flags |= IORING_SETUP_SQ_AFF; - p.sq_thread_cpu = sq_thread_cpu; - } - } - - fd = io_uring_setup(DEPTH, &p); - if (fd < 0) { - perror("io_uring_setup"); - return 1; - } - s->ring_fd = fd; - - if (fixedbufs) { - ret = io_uring_register_buffers(s); - if (ret < 0) { - perror("io_uring_register_buffers"); - return 1; - } - } - - if (register_files) { - ret = io_uring_register_files(s); - if (ret < 0) { - perror("io_uring_register_files"); - return 1; - } - } - - ptr = mmap(0, p.sq_off.array + p.sq_entries * sizeof(__u32), - PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, - IORING_OFF_SQ_RING); - printf("sq_ring ptr = 0x%p\n", ptr); - sring->head = ptr + p.sq_off.head; - sring->tail = ptr + p.sq_off.tail; - sring->ring_mask = ptr + p.sq_off.ring_mask; - sring->ring_entries = ptr + p.sq_off.ring_entries; - sring->flags = ptr + p.sq_off.flags; - sring->array = ptr + p.sq_off.array; - sq_ring_mask = *sring->ring_mask; - - s->sqes = mmap(0, p.sq_entries * sizeof(struct io_uring_sqe), - PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, - IORING_OFF_SQES); - printf("sqes ptr = 0x%p\n", s->sqes); - - ptr = mmap(0, p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe), - PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, - IORING_OFF_CQ_RING); - printf("cq_ring ptr = 0x%p\n", ptr); - cring->head = ptr + p.cq_off.head; - cring->tail = ptr + p.cq_off.tail; - cring->ring_mask = ptr + p.cq_off.ring_mask; - cring->ring_entries = ptr + p.cq_off.ring_entries; - cring->cqes = ptr + p.cq_off.cqes; - cq_ring_mask = *cring->ring_mask; - return 0; -} - -static void file_depths(char *buf) -{ - struct submitter *s = &submitters[0]; - unsigned i; - char *p; - - buf[0] = '\0'; - p = buf; - for (i = 0; i < s->nr_files; i++) { - struct file *f = &s->files[i]; - - if (i + 1 == s->nr_files) - p += sprintf(p, "%d", f->pending_ios); - else - p += sprintf(p, "%d, ", f->pending_ios); - } -} - -int main(int argc, char *argv[]) -{ - struct submitter *s = &submitters[0]; - unsigned long done, calls, reap; - int err, i, flags, fd; - char *fdepths; - void *ret; - - if (!do_nop && argc < 2) { - printf("%s: 
filename\n", argv[0]); - return 1; - } - - flags = O_RDONLY | O_NOATIME; - if (!buffered) - flags |= O_DIRECT; - - i = 1; - while (!do_nop && i < argc) { - struct file *f; - - if (s->nr_files == MAX_FDS) { - printf("Max number of files (%d) reached\n", MAX_FDS); - break; - } - fd = open(argv[i], flags); - if (fd < 0) { - perror("open"); - return 1; - } - - f = &s->files[s->nr_files]; - f->real_fd = fd; - if (get_file_size(f)) { - printf("failed getting size of device/file\n"); - return 1; - } - if (f->max_blocks <= 1) { - printf("Zero file/device size?\n"); - return 1; - } - f->max_blocks--; - - printf("Added file %s\n", argv[i]); - s->nr_files++; - i++; - } - - if (fixedbufs) { - struct rlimit rlim; - - rlim.rlim_cur = RLIM_INFINITY; - rlim.rlim_max = RLIM_INFINITY; - if (setrlimit(RLIMIT_MEMLOCK, &rlim) < 0) { - perror("setrlimit"); - return 1; - } - } - - arm_sig_int(); - - for (i = 0; i < DEPTH; i++) { - void *buf; - - if (posix_memalign(&buf, BS, BS)) { - printf("failed alloc\n"); - return 1; - } - s->iovecs[i].iov_base = buf; - s->iovecs[i].iov_len = BS; - } - - err = setup_ring(s); - if (err) { - printf("ring setup failed: %s, %d\n", strerror(errno), err); - return 1; - } - printf("polled=%d, fixedbufs=%d, buffered=%d", polled, fixedbufs, buffered); - printf(" QD=%d, sq_ring=%d, cq_ring=%d\n", DEPTH, *s->sq_ring.ring_entries, *s->cq_ring.ring_entries); - - pthread_create(&s->thread, NULL, submitter_fn, s); - - fdepths = malloc(8 * s->nr_files); - reap = calls = done = 0; - do { - unsigned long this_done = 0; - unsigned long this_reap = 0; - unsigned long this_call = 0; - unsigned long rpc = 0, ipc = 0; - - sleep(1); - this_done += s->done; - this_call += s->calls; - this_reap += s->reaps; - if (this_call - calls) { - rpc = (this_done - done) / (this_call - calls); - ipc = (this_reap - reap) / (this_call - calls); - } else - rpc = ipc = -1; - file_depths(fdepths); - printf("IOPS=%lu, IOS/call=%ld/%ld, inflight=%u (%s)\n", - this_done - done, rpc, ipc, s->inflight, - fdepths); - done = this_done; - calls = this_call; - reap = this_reap; - } while (!finish); - - pthread_join(s->thread, &ret); - close(s->ring_fd); - free(fdepths); - return 0; -} diff --git a/tools/io_uring/io_uring-cp.c b/tools/io_uring/io_uring-cp.c deleted file mode 100644 index d9bd6f5f8f46..000000000000 --- a/tools/io_uring/io_uring-cp.c +++ /dev/null @@ -1,283 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Simple test program that demonstrates a file copy through io_uring. This - * uses the API exposed by liburing. 
- * - * Copyright (C) 2018-2019 Jens Axboe - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "liburing.h" - -#define QD 64 -#define BS (32*1024) - -static int infd, outfd; - -struct io_data { - int read; - off_t first_offset, offset; - size_t first_len; - struct iovec iov; -}; - -static int setup_context(unsigned entries, struct io_uring *ring) -{ - int ret; - - ret = io_uring_queue_init(entries, ring, 0); - if (ret < 0) { - fprintf(stderr, "queue_init: %s\n", strerror(-ret)); - return -1; - } - - return 0; -} - -static int get_file_size(int fd, off_t *size) -{ - struct stat st; - - if (fstat(fd, &st) < 0) - return -1; - if (S_ISREG(st.st_mode)) { - *size = st.st_size; - return 0; - } else if (S_ISBLK(st.st_mode)) { - unsigned long long bytes; - - if (ioctl(fd, BLKGETSIZE64, &bytes) != 0) - return -1; - - *size = bytes; - return 0; - } - - return -1; -} - -static void queue_prepped(struct io_uring *ring, struct io_data *data) -{ - struct io_uring_sqe *sqe; - - sqe = io_uring_get_sqe(ring); - assert(sqe); - - if (data->read) - io_uring_prep_readv(sqe, infd, &data->iov, 1, data->offset); - else - io_uring_prep_writev(sqe, outfd, &data->iov, 1, data->offset); - - io_uring_sqe_set_data(sqe, data); -} - -static int queue_read(struct io_uring *ring, off_t size, off_t offset) -{ - struct io_uring_sqe *sqe; - struct io_data *data; - - data = malloc(size + sizeof(*data)); - if (!data) - return 1; - - sqe = io_uring_get_sqe(ring); - if (!sqe) { - free(data); - return 1; - } - - data->read = 1; - data->offset = data->first_offset = offset; - - data->iov.iov_base = data + 1; - data->iov.iov_len = size; - data->first_len = size; - - io_uring_prep_readv(sqe, infd, &data->iov, 1, offset); - io_uring_sqe_set_data(sqe, data); - return 0; -} - -static void queue_write(struct io_uring *ring, struct io_data *data) -{ - data->read = 0; - data->offset = data->first_offset; - - data->iov.iov_base = data + 1; - data->iov.iov_len = data->first_len; - - queue_prepped(ring, data); - io_uring_submit(ring); -} - -static int copy_file(struct io_uring *ring, off_t insize) -{ - unsigned long reads, writes; - struct io_uring_cqe *cqe; - off_t write_left, offset; - int ret; - - write_left = insize; - writes = reads = offset = 0; - - while (insize || write_left) { - int had_reads, got_comp; - - /* - * Queue up as many reads as we can - */ - had_reads = reads; - while (insize) { - off_t this_size = insize; - - if (reads + writes >= QD) - break; - if (this_size > BS) - this_size = BS; - else if (!this_size) - break; - - if (queue_read(ring, this_size, offset)) - break; - - insize -= this_size; - offset += this_size; - reads++; - } - - if (had_reads != reads) { - ret = io_uring_submit(ring); - if (ret < 0) { - fprintf(stderr, "io_uring_submit: %s\n", strerror(-ret)); - break; - } - } - - /* - * Queue is full at this point. Find at least one completion. 
- */ - got_comp = 0; - while (write_left) { - struct io_data *data; - - if (!got_comp) { - ret = io_uring_wait_cqe(ring, &cqe); - got_comp = 1; - } else { - ret = io_uring_peek_cqe(ring, &cqe); - if (ret == -EAGAIN) { - cqe = NULL; - ret = 0; - } - } - if (ret < 0) { - fprintf(stderr, "io_uring_peek_cqe: %s\n", - strerror(-ret)); - return 1; - } - if (!cqe) - break; - - data = io_uring_cqe_get_data(cqe); - if (cqe->res < 0) { - if (cqe->res == -EAGAIN) { - queue_prepped(ring, data); - io_uring_cqe_seen(ring, cqe); - continue; - } - fprintf(stderr, "cqe failed: %s\n", - strerror(-cqe->res)); - return 1; - } else if (cqe->res != data->iov.iov_len) { - /* Short read/write, adjust and requeue */ - data->iov.iov_base += cqe->res; - data->iov.iov_len -= cqe->res; - data->offset += cqe->res; - queue_prepped(ring, data); - io_uring_cqe_seen(ring, cqe); - continue; - } - - /* - * All done. if write, nothing else to do. if read, - * queue up corresponding write. - */ - if (data->read) { - queue_write(ring, data); - write_left -= data->first_len; - reads--; - writes++; - } else { - free(data); - writes--; - } - io_uring_cqe_seen(ring, cqe); - } - } - - /* wait out pending writes */ - while (writes) { - struct io_data *data; - - ret = io_uring_wait_cqe(ring, &cqe); - if (ret) { - fprintf(stderr, "wait_cqe=%d\n", ret); - return 1; - } - if (cqe->res < 0) { - fprintf(stderr, "write res=%d\n", cqe->res); - return 1; - } - data = io_uring_cqe_get_data(cqe); - free(data); - writes--; - io_uring_cqe_seen(ring, cqe); - } - - return 0; -} - -int main(int argc, char *argv[]) -{ - struct io_uring ring; - off_t insize; - int ret; - - if (argc < 3) { - printf("%s: infile outfile\n", argv[0]); - return 1; - } - - infd = open(argv[1], O_RDONLY); - if (infd < 0) { - perror("open infile"); - return 1; - } - outfd = open(argv[2], O_WRONLY | O_CREAT | O_TRUNC, 0644); - if (outfd < 0) { - perror("open outfile"); - return 1; - } - - if (setup_context(QD, &ring)) - return 1; - if (get_file_size(infd, &insize)) - return 1; - - ret = copy_file(&ring, insize); - - close(infd); - close(outfd); - io_uring_queue_exit(&ring); - return ret; -} diff --git a/tools/io_uring/liburing.h b/tools/io_uring/liburing.h deleted file mode 100644 index 28a837b6069d..000000000000 --- a/tools/io_uring/liburing.h +++ /dev/null @@ -1,187 +0,0 @@ -#ifndef LIB_URING_H -#define LIB_URING_H - -#ifdef __cplusplus -extern "C" { -#endif - -#include -#include -#include -#include "../../include/uapi/linux/io_uring.h" -#include -#include -#include "barrier.h" - -/* - * Library interface to io_uring - */ -struct io_uring_sq { - unsigned *khead; - unsigned *ktail; - unsigned *kring_mask; - unsigned *kring_entries; - unsigned *kflags; - unsigned *kdropped; - unsigned *array; - struct io_uring_sqe *sqes; - - unsigned sqe_head; - unsigned sqe_tail; - - size_t ring_sz; -}; - -struct io_uring_cq { - unsigned *khead; - unsigned *ktail; - unsigned *kring_mask; - unsigned *kring_entries; - unsigned *koverflow; - struct io_uring_cqe *cqes; - - size_t ring_sz; -}; - -struct io_uring { - struct io_uring_sq sq; - struct io_uring_cq cq; - int ring_fd; -}; - -/* - * System calls - */ -extern int io_uring_setup(unsigned entries, struct io_uring_params *p); -extern int io_uring_enter(int fd, unsigned to_submit, - unsigned min_complete, unsigned flags, sigset_t *sig); -extern int io_uring_register(int fd, unsigned int opcode, void *arg, - unsigned int nr_args); - -/* - * Library interface - */ -extern int io_uring_queue_init(unsigned entries, struct io_uring *ring, - 
unsigned flags); -extern int io_uring_queue_mmap(int fd, struct io_uring_params *p, - struct io_uring *ring); -extern void io_uring_queue_exit(struct io_uring *ring); -extern int io_uring_peek_cqe(struct io_uring *ring, - struct io_uring_cqe **cqe_ptr); -extern int io_uring_wait_cqe(struct io_uring *ring, - struct io_uring_cqe **cqe_ptr); -extern int io_uring_submit(struct io_uring *ring); -extern struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring); - -/* - * Must be called after io_uring_{peek,wait}_cqe() after the cqe has - * been processed by the application. - */ -static inline void io_uring_cqe_seen(struct io_uring *ring, - struct io_uring_cqe *cqe) -{ - if (cqe) { - struct io_uring_cq *cq = &ring->cq; - - (*cq->khead)++; - /* - * Ensure that the kernel sees our new head, the kernel has - * the matching read barrier. - */ - write_barrier(); - } -} - -/* - * Command prep helpers - */ -static inline void io_uring_sqe_set_data(struct io_uring_sqe *sqe, void *data) -{ - sqe->user_data = (unsigned long) data; -} - -static inline void *io_uring_cqe_get_data(struct io_uring_cqe *cqe) -{ - return (void *) (uintptr_t) cqe->user_data; -} - -static inline void io_uring_prep_rw(int op, struct io_uring_sqe *sqe, int fd, - const void *addr, unsigned len, - off_t offset) -{ - memset(sqe, 0, sizeof(*sqe)); - sqe->opcode = op; - sqe->fd = fd; - sqe->off = offset; - sqe->addr = (unsigned long) addr; - sqe->len = len; -} - -static inline void io_uring_prep_readv(struct io_uring_sqe *sqe, int fd, - const struct iovec *iovecs, - unsigned nr_vecs, off_t offset) -{ - io_uring_prep_rw(IORING_OP_READV, sqe, fd, iovecs, nr_vecs, offset); -} - -static inline void io_uring_prep_read_fixed(struct io_uring_sqe *sqe, int fd, - void *buf, unsigned nbytes, - off_t offset) -{ - io_uring_prep_rw(IORING_OP_READ_FIXED, sqe, fd, buf, nbytes, offset); -} - -static inline void io_uring_prep_writev(struct io_uring_sqe *sqe, int fd, - const struct iovec *iovecs, - unsigned nr_vecs, off_t offset) -{ - io_uring_prep_rw(IORING_OP_WRITEV, sqe, fd, iovecs, nr_vecs, offset); -} - -static inline void io_uring_prep_write_fixed(struct io_uring_sqe *sqe, int fd, - const void *buf, unsigned nbytes, - off_t offset) -{ - io_uring_prep_rw(IORING_OP_WRITE_FIXED, sqe, fd, buf, nbytes, offset); -} - -static inline void io_uring_prep_poll_add(struct io_uring_sqe *sqe, int fd, - unsigned poll_mask) -{ - memset(sqe, 0, sizeof(*sqe)); - sqe->opcode = IORING_OP_POLL_ADD; - sqe->fd = fd; -#if __BYTE_ORDER == __BIG_ENDIAN - poll_mask = __swahw32(poll_mask); -#endif - sqe->poll_events = poll_mask; -} - -static inline void io_uring_prep_poll_remove(struct io_uring_sqe *sqe, - void *user_data) -{ - memset(sqe, 0, sizeof(*sqe)); - sqe->opcode = IORING_OP_POLL_REMOVE; - sqe->addr = (unsigned long) user_data; -} - -static inline void io_uring_prep_fsync(struct io_uring_sqe *sqe, int fd, - unsigned fsync_flags) -{ - memset(sqe, 0, sizeof(*sqe)); - sqe->opcode = IORING_OP_FSYNC; - sqe->fd = fd; - sqe->fsync_flags = fsync_flags; -} - -static inline void io_uring_prep_nop(struct io_uring_sqe *sqe) -{ - memset(sqe, 0, sizeof(*sqe)); - sqe->opcode = IORING_OP_NOP; -} - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/tools/io_uring/queue.c b/tools/io_uring/queue.c deleted file mode 100644 index 321819c132c7..000000000000 --- a/tools/io_uring/queue.c +++ /dev/null @@ -1,156 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include "liburing.h" -#include "barrier.h" - -static int __io_uring_get_cqe(struct io_uring *ring, - 
struct io_uring_cqe **cqe_ptr, int wait) -{ - struct io_uring_cq *cq = &ring->cq; - const unsigned mask = *cq->kring_mask; - unsigned head; - int ret; - - *cqe_ptr = NULL; - head = *cq->khead; - do { - /* - * It's necessary to use a read_barrier() before reading - * the CQ tail, since the kernel updates it locklessly. The - * kernel has the matching store barrier for the update. The - * kernel also ensures that previous stores to CQEs are ordered - * with the tail update. - */ - read_barrier(); - if (head != *cq->ktail) { - *cqe_ptr = &cq->cqes[head & mask]; - break; - } - if (!wait) - break; - ret = io_uring_enter(ring->ring_fd, 0, 1, - IORING_ENTER_GETEVENTS, NULL); - if (ret < 0) - return -errno; - } while (1); - - return 0; -} - -/* - * Return an IO completion, if one is readily available. Returns 0 with - * cqe_ptr filled in on success, -errno on failure. - */ -int io_uring_peek_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr) -{ - return __io_uring_get_cqe(ring, cqe_ptr, 0); -} - -/* - * Return an IO completion, waiting for it if necessary. Returns 0 with - * cqe_ptr filled in on success, -errno on failure. - */ -int io_uring_wait_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr) -{ - return __io_uring_get_cqe(ring, cqe_ptr, 1); -} - -/* - * Submit sqes acquired from io_uring_get_sqe() to the kernel. - * - * Returns number of sqes submitted - */ -int io_uring_submit(struct io_uring *ring) -{ - struct io_uring_sq *sq = &ring->sq; - const unsigned mask = *sq->kring_mask; - unsigned ktail, ktail_next, submitted, to_submit; - int ret; - - /* - * If we have pending IO in the kring, submit it first. We need a - * read barrier here to match the kernels store barrier when updating - * the SQ head. - */ - read_barrier(); - if (*sq->khead != *sq->ktail) { - submitted = *sq->kring_entries; - goto submit; - } - - if (sq->sqe_head == sq->sqe_tail) - return 0; - - /* - * Fill in sqes that we have queued up, adding them to the kernel ring - */ - submitted = 0; - ktail = ktail_next = *sq->ktail; - to_submit = sq->sqe_tail - sq->sqe_head; - while (to_submit--) { - ktail_next++; - read_barrier(); - - sq->array[ktail & mask] = sq->sqe_head & mask; - ktail = ktail_next; - - sq->sqe_head++; - submitted++; - } - - if (!submitted) - return 0; - - if (*sq->ktail != ktail) { - /* - * First write barrier ensures that the SQE stores are updated - * with the tail update. This is needed so that the kernel - * will never see a tail update without the preceeding sQE - * stores being done. - */ - write_barrier(); - *sq->ktail = ktail; - /* - * The kernel has the matching read barrier for reading the - * SQ tail. - */ - write_barrier(); - } - -submit: - ret = io_uring_enter(ring->ring_fd, submitted, 0, - IORING_ENTER_GETEVENTS, NULL); - if (ret < 0) - return -errno; - - return ret; -} - -/* - * Return an sqe to fill. Application must later call io_uring_submit() - * when it's ready to tell the kernel about it. The caller may call this - * function multiple times before calling io_uring_submit(). - * - * Returns a vacant sqe, or NULL if we're full. 
- */ -struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring) -{ - struct io_uring_sq *sq = &ring->sq; - unsigned next = sq->sqe_tail + 1; - struct io_uring_sqe *sqe; - - /* - * All sqes are used - */ - if (next - sq->sqe_head > *sq->kring_entries) - return NULL; - - sqe = &sq->sqes[sq->sqe_tail & *sq->kring_mask]; - sq->sqe_tail = next; - return sqe; -} diff --git a/tools/io_uring/setup.c b/tools/io_uring/setup.c deleted file mode 100644 index 0b50fcd78520..000000000000 --- a/tools/io_uring/setup.c +++ /dev/null @@ -1,107 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include "liburing.h" - -static int io_uring_mmap(int fd, struct io_uring_params *p, - struct io_uring_sq *sq, struct io_uring_cq *cq) -{ - size_t size; - void *ptr; - int ret; - - sq->ring_sz = p->sq_off.array + p->sq_entries * sizeof(unsigned); - ptr = mmap(0, sq->ring_sz, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING); - if (ptr == MAP_FAILED) - return -errno; - sq->khead = ptr + p->sq_off.head; - sq->ktail = ptr + p->sq_off.tail; - sq->kring_mask = ptr + p->sq_off.ring_mask; - sq->kring_entries = ptr + p->sq_off.ring_entries; - sq->kflags = ptr + p->sq_off.flags; - sq->kdropped = ptr + p->sq_off.dropped; - sq->array = ptr + p->sq_off.array; - - size = p->sq_entries * sizeof(struct io_uring_sqe); - sq->sqes = mmap(0, size, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_POPULATE, fd, - IORING_OFF_SQES); - if (sq->sqes == MAP_FAILED) { - ret = -errno; -err: - munmap(sq->khead, sq->ring_sz); - return ret; - } - - cq->ring_sz = p->cq_off.cqes + p->cq_entries * sizeof(struct io_uring_cqe); - ptr = mmap(0, cq->ring_sz, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING); - if (ptr == MAP_FAILED) { - ret = -errno; - munmap(sq->sqes, p->sq_entries * sizeof(struct io_uring_sqe)); - goto err; - } - cq->khead = ptr + p->cq_off.head; - cq->ktail = ptr + p->cq_off.tail; - cq->kring_mask = ptr + p->cq_off.ring_mask; - cq->kring_entries = ptr + p->cq_off.ring_entries; - cq->koverflow = ptr + p->cq_off.overflow; - cq->cqes = ptr + p->cq_off.cqes; - return 0; -} - -/* - * For users that want to specify sq_thread_cpu or sq_thread_idle, this - * interface is a convenient helper for mmap()ing the rings. - * Returns -1 on error, or zero on success. On success, 'ring' - * contains the necessary information to read/write to the rings. - */ -int io_uring_queue_mmap(int fd, struct io_uring_params *p, struct io_uring *ring) -{ - int ret; - - memset(ring, 0, sizeof(*ring)); - ret = io_uring_mmap(fd, p, &ring->sq, &ring->cq); - if (!ret) - ring->ring_fd = fd; - return ret; -} - -/* - * Returns -1 on error, or zero on success. On success, 'ring' - * contains the necessary information to read/write to the rings. 
- */ -int io_uring_queue_init(unsigned entries, struct io_uring *ring, unsigned flags) -{ - struct io_uring_params p; - int fd, ret; - - memset(&p, 0, sizeof(p)); - p.flags = flags; - - fd = io_uring_setup(entries, &p); - if (fd < 0) - return fd; - - ret = io_uring_queue_mmap(fd, &p, ring); - if (ret) - close(fd); - - return ret; -} - -void io_uring_queue_exit(struct io_uring *ring) -{ - struct io_uring_sq *sq = &ring->sq; - struct io_uring_cq *cq = &ring->cq; - - munmap(sq->sqes, *sq->kring_entries * sizeof(struct io_uring_sqe)); - munmap(sq->khead, sq->ring_sz); - munmap(cq->khead, cq->ring_sz); - close(ring->ring_fd); -} diff --git a/tools/io_uring/syscall.c b/tools/io_uring/syscall.c deleted file mode 100644 index b22e0aa54e9d..000000000000 --- a/tools/io_uring/syscall.c +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Will go away once libc support is there - */ -#include -#include -#include -#include -#include "liburing.h" - -#ifdef __alpha__ -/* - * alpha is the only exception, all other architectures - * have common numbers for new system calls. - */ -# ifndef __NR_io_uring_setup -# define __NR_io_uring_setup 535 -# endif -# ifndef __NR_io_uring_enter -# define __NR_io_uring_enter 536 -# endif -# ifndef __NR_io_uring_register -# define __NR_io_uring_register 537 -# endif -#else /* !__alpha__ */ -# ifndef __NR_io_uring_setup -# define __NR_io_uring_setup 425 -# endif -# ifndef __NR_io_uring_enter -# define __NR_io_uring_enter 426 -# endif -# ifndef __NR_io_uring_register -# define __NR_io_uring_register 427 -# endif -#endif - -int io_uring_register(int fd, unsigned int opcode, void *arg, - unsigned int nr_args) -{ - return syscall(__NR_io_uring_register, fd, opcode, arg, nr_args); -} - -int io_uring_setup(unsigned int entries, struct io_uring_params *p) -{ - return syscall(__NR_io_uring_setup, entries, p); -} - -int io_uring_enter(int fd, unsigned int to_submit, unsigned int min_complete, - unsigned int flags, sigset_t *sig) -{ - return syscall(__NR_io_uring_enter, fd, to_submit, min_complete, - flags, sig, _NSIG / 8); -}
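
The in-tree liburing copy and example programs removed above are superseded by the standalone liburing project, whose public API matches the declarations in the deleted liburing.h. The sketch below is purely illustrative and not part of the patch: it shows the same setup/submit/complete cycle against the external library, assuming its headers are installed and the program is linked with -luring; the "testfile" path and the 4 KiB read size are placeholder values.

/*
 * Minimal sketch: queue one readv through io_uring via the external
 * liburing library, wait for its completion, and tear the ring down.
 * Assumes liburing is installed; "testfile" is an illustrative path.
 */
#include <liburing.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	struct iovec iov;
	char buf[4096];
	int fd, ret;

	/* an 8-entry ring is plenty for a single request */
	ret = io_uring_queue_init(8, &ring, 0);
	if (ret < 0) {
		fprintf(stderr, "queue_init: %s\n", strerror(-ret));
		return 1;
	}

	fd = open("testfile", O_RDONLY);	/* illustrative path */
	if (fd < 0) {
		perror("open");
		return 1;
	}

	iov.iov_base = buf;
	iov.iov_len = sizeof(buf);

	/* grab an sqe, describe the read, then hand it to the kernel */
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_readv(sqe, fd, &iov, 1, 0);
	io_uring_sqe_set_data(sqe, buf);
	ret = io_uring_submit(&ring);
	if (ret < 0) {
		fprintf(stderr, "submit: %s\n", strerror(-ret));
		return 1;
	}

	/* block until the completion arrives, then mark it consumed */
	ret = io_uring_wait_cqe(&ring, &cqe);
	if (ret < 0) {
		fprintf(stderr, "wait_cqe: %s\n", strerror(-ret));
		return 1;
	}
	printf("read returned %d\n", cqe->res);
	io_uring_cqe_seen(&ring, cqe);

	close(fd);
	io_uring_queue_exit(&ring);
	return 0;
}

Built with something like "gcc -O2 uring-read.c -o uring-read -luring", this covers the same ground as the deleted queue.c/setup.c helpers (ring mmap, sqe handout, tail update, cqe consumption) while leaving the memory-barrier handling to the external library.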