Merge: io_uring: update to upstream v6.6

MR: https://gitlab.com/redhat/centos-stream/src/kernel/centos-stream-9/-/merge_requests/3318

Update io_uring and its dependencies to upstream kernel version 6.6.

JIRA: https://issues.redhat.com/browse/RHEL-12076
JIRA: https://issues.redhat.com/browse/RHEL-14998
JIRA: https://issues.redhat.com/browse/RHEL-4447
CVE: CVE-2023-46862

Omitted-Fix: ab69838e7c75 ("io_uring/kbuf: Fix check of BID wrapping in provided buffers")
Omitted-Fix: f74c746e476b ("io_uring/kbuf: Allow the full buffer id space for provided buffers")

The following new features are available (from upstream kernel versions 6.3-6.6):

    User-specified ring buffer
    Provided buffers allocated by the kernel
    Ability to register the ring fd
    Multi-shot timeouts (see the sketch below)
    Ability to pass custom flags to the completion queue entry for ring messages

All of these features are covered by the liburing tests.

In my testing, no-mmap-inval.t failed because of a broken test, and socket-uring-cmd.t failed because of a missing SELinux policy rule; run audit2allow if you see a failure in the latter test.
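
As an illustration of the multi-shot timeout feature listed above, here is a minimal sketch using liburing. It is not part of this merge; it assumes liburing headers new enough to define IORING_TIMEOUT_MULTISHOT, and my reading that the timeout's count field bounds the number of firings and that each firing completes with -ETIME. Treat it as a sketch, not reference usage.

    /*
     * Hedged sketch: arm one multi-shot timeout and reap its completions.
     * Build: gcc -o ms-timeout ms-timeout.c -luring
     */
    #include <stdio.h>
    #include <string.h>
    #include <liburing.h>

    int main(void)
    {
        struct io_uring ring;
        struct io_uring_sqe *sqe;
        struct io_uring_cqe *cqe;
        struct __kernel_timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
        int ret, i;

        ret = io_uring_queue_init(8, &ring, 0);
        if (ret < 0) {
            fprintf(stderr, "queue_init: %s\n", strerror(-ret));
            return 1;
        }

        sqe = io_uring_get_sqe(&ring);
        /* count = 3 asks for three firings; older kernels reject the flag */
        io_uring_prep_timeout(sqe, &ts, 3, IORING_TIMEOUT_MULTISHOT);
        io_uring_submit(&ring);

        for (i = 0; i < 3; i++) {
            ret = io_uring_wait_cqe(&ring, &cqe);
            if (ret < 0)
                break;
            /* each expiration is expected to post a CQE with res == -ETIME */
            printf("tick %d: res %d\n", i + 1, cqe->res);
            io_uring_cqe_seen(&ring, cqe);
        }

        io_uring_queue_exit(&ring);
        return 0;
    }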

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>

Approved-by: Wander Lairson Costa <wander@redhat.com>
Approved-by: Donald Dutile <ddutile@redhat.com>
Approved-by: Chris von Recklinghausen <crecklin@redhat.com>
Approved-by: Jiri Benc <jbenc@redhat.com>
Approved-by: Ming Lei <ming.lei@redhat.com>

Signed-off-by: Scott Weaver <scweaver@redhat.com>


@@ -448,17 +448,26 @@ io_uring_disabled
 Prevents all processes from creating new io_uring instances. Enabling this
 shrinks the kernel's attack surface.
 
-= ==================================================================
-0 All processes can create io_uring instances as normal. This is the
-  default setting.
-1 io_uring creation is disabled for unprivileged processes.
-  io_uring_setup fails with -EPERM unless the calling process is
-  privileged (CAP_SYS_ADMIN). Existing io_uring instances can
-  still be used.
-2 io_uring creation is disabled for all processes. io_uring_setup
+= ======================================================================
+0 All processes can create io_uring instances as normal.
+1 io_uring creation is disabled (io_uring_setup() will fail with
+  -EPERM) for unprivileged processes not in the io_uring_group group.
+  Existing io_uring instances can still be used. See the
+  documentation for io_uring_group for more information.
+2 io_uring creation is disabled for all processes. io_uring_setup()
   always fails with -EPERM. Existing io_uring instances can still be
-  used.
-= ==================================================================
+  used. This is the default setting.
+= ======================================================================
+
+io_uring_group
+==============
+
+When io_uring_disabled is set to 1, a process must either be
+privileged (CAP_SYS_ADMIN) or be in the io_uring_group group in order
+to create an io_uring instance. If io_uring_group is set to -1 (the
+default), only processes with the CAP_SYS_ADMIN capability may create
+io_uring instances.
 
 kexec_load_disabled
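
The sysctl text above can be exercised with a small probe. This is not part of the diff; it is a hedged sketch assuming only the documented semantics and the raw io_uring_setup(2) syscall. Run it as an unprivileged user with kernel.io_uring_disabled=1 and it should report EPERM unless that user is in the group configured in kernel.io_uring_group.

    /* Hedged probe for the io_uring_disabled policy described above. */
    #include <stdio.h>
    #include <string.h>
    #include <errno.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/io_uring.h>

    int main(void)
    {
        struct io_uring_params p;
        int fd;

        memset(&p, 0, sizeof(p));
        fd = syscall(__NR_io_uring_setup, 4, &p);   /* 4 SQ entries */
        if (fd < 0) {
            /* EPERM here reflects the sysctl policy, not a malfunction */
            fprintf(stderr, "io_uring_setup: %s\n", strerror(errno));
            return 1;
        }
        printf("io_uring creation allowed, ring fd %d\n", fd);
        close(fd);
        return 0;
    }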


@ -10086,7 +10086,6 @@ F: io_uring/
F: include/linux/io_uring.h F: include/linux/io_uring.h
F: include/linux/io_uring_types.h F: include/linux/io_uring_types.h
F: include/uapi/linux/io_uring.h F: include/uapi/linux/io_uring.h
F: tools/io_uring/
IPMI SUBSYSTEM IPMI SUBSYSTEM
M: Corey Minyard <minyard@acm.org> M: Corey Minyard <minyard@acm.org>


@ -29,9 +29,11 @@ static struct bio_map_data *bio_alloc_map_data(struct iov_iter *data,
bmd = kmalloc(struct_size(bmd, iov, data->nr_segs), gfp_mask); bmd = kmalloc(struct_size(bmd, iov, data->nr_segs), gfp_mask);
if (!bmd) if (!bmd)
return NULL; return NULL;
memcpy(bmd->iov, data->iov, sizeof(struct iovec) * data->nr_segs);
bmd->iter = *data; bmd->iter = *data;
bmd->iter.iov = bmd->iov; if (iter_is_iovec(data)) {
memcpy(bmd->iov, iter_iov(data), sizeof(struct iovec) * data->nr_segs);
bmd->iter.__iov = bmd->iov;
}
return bmd; return bmd;
} }
@ -636,7 +638,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
copy = true; copy = true;
else if (iov_iter_is_bvec(iter)) else if (iov_iter_is_bvec(iter))
map_bvec = true; map_bvec = true;
else if (!iter_is_iovec(iter)) else if (!user_backed_iter(iter))
copy = true; copy = true;
else if (queue_virt_boundary(q)) else if (queue_virt_boundary(q))
copy = queue_virt_boundary(q) & iov_iter_gap_alignment(iter); copy = queue_virt_boundary(q) & iov_iter_gap_alignment(iter);
@ -677,9 +679,8 @@ int blk_rq_map_user(struct request_queue *q, struct request *rq,
struct rq_map_data *map_data, void __user *ubuf, struct rq_map_data *map_data, void __user *ubuf,
unsigned long len, gfp_t gfp_mask) unsigned long len, gfp_t gfp_mask)
{ {
struct iovec iov;
struct iov_iter i; struct iov_iter i;
int ret = import_single_range(rq_data_dir(rq), ubuf, len, &iov, &i); int ret = import_ubuf(rq_data_dir(rq), ubuf, len, &i);
if (unlikely(ret < 0)) if (unlikely(ret < 0))
return ret; return ret;


@ -506,7 +506,7 @@ static int blkdev_open(struct inode *inode, struct file *filp)
* during an unstable branch. * during an unstable branch.
*/ */
filp->f_flags |= O_LARGEFILE; filp->f_flags |= O_LARGEFILE;
filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; filp->f_mode |= FMODE_BUF_RASYNC;
/* /*
* Use the file private data to store the holder for exclusive openes. * Use the file private data to store the holder for exclusive openes.
@ -520,6 +520,9 @@ static int blkdev_open(struct inode *inode, struct file *filp)
if (IS_ERR(bdev)) if (IS_ERR(bdev))
return PTR_ERR(bdev); return PTR_ERR(bdev);
if (bdev_nowait(bdev))
filp->f_mode |= FMODE_NOWAIT;
filp->f_mapping = bdev->bd_inode->i_mapping; filp->f_mapping = bdev->bd_inode->i_mapping;
filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping); filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping);
return 0; return 0;


@ -362,7 +362,7 @@ static unsigned zero_mmap_capabilities(struct file *file)
/* can't do an in-place private mapping if there's no MMU */ /* can't do an in-place private mapping if there's no MMU */
static inline int private_mapping_ok(struct vm_area_struct *vma) static inline int private_mapping_ok(struct vm_area_struct *vma)
{ {
return vma->vm_flags & VM_MAYSHARE; return is_nommu_shared_mapping(vma->vm_flags);
} }
#else #else


@ -284,11 +284,12 @@ static ssize_t hfi1_write_iter(struct kiocb *kiocb, struct iov_iter *from)
} }
while (dim) { while (dim) {
const struct iovec *iov = iter_iov(from);
int ret; int ret;
unsigned long count = 0; unsigned long count = 0;
ret = hfi1_user_sdma_process_request( ret = hfi1_user_sdma_process_request(
fd, (struct iovec *)(from->iov + done), fd, (struct iovec *)(iov + done),
dim, &count); dim, &count);
if (ret) { if (ret) {
reqs = ret; reqs = ret;


@ -2246,10 +2246,10 @@ static ssize_t qib_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct qib_ctxtdata *rcd = ctxt_fp(iocb->ki_filp); struct qib_ctxtdata *rcd = ctxt_fp(iocb->ki_filp);
struct qib_user_sdma_queue *pq = fp->pq; struct qib_user_sdma_queue *pq = fp->pq;
if (!iter_is_iovec(from) || !from->nr_segs || !pq) if (!from->user_backed || !from->nr_segs || !pq)
return -EINVAL; return -EINVAL;
return qib_user_sdma_writev(rcd, pq, from->iov, from->nr_segs); return qib_user_sdma_writev(rcd, pq, iter_iov(from), from->nr_segs);
} }
static struct class *qib_class; static struct class *qib_class;


@ -1473,7 +1473,8 @@ static struct sk_buff *tun_napi_alloc_frags(struct tun_file *tfile,
skb->truesize += skb->data_len; skb->truesize += skb->data_len;
for (i = 1; i < it->nr_segs; i++) { for (i = 1; i < it->nr_segs; i++) {
size_t fragsz = it->iov[i].iov_len; const struct iovec *iov = iter_iov(it);
size_t fragsz = iov->iov_len;
struct page *page; struct page *page;
void *frag; void *frag;


@ -551,7 +551,7 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
struct io_uring_cmd *ioucmd, unsigned int issue_flags, bool vec) struct io_uring_cmd *ioucmd, unsigned int issue_flags, bool vec)
{ {
struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd); struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
const struct nvme_uring_cmd *cmd = ioucmd->cmd; const struct nvme_uring_cmd *cmd = io_uring_sqe_cmd(ioucmd->sqe);
struct request_queue *q = ns ? ns->queue : ctrl->admin_q; struct request_queue *q = ns ? ns->queue : ctrl->admin_q;
struct nvme_uring_data d; struct nvme_uring_data d;
struct nvme_command c; struct nvme_command c;


@ -1246,7 +1246,7 @@ static ssize_t ffs_epfile_read_iter(struct kiocb *kiocb, struct iov_iter *to)
p->kiocb = kiocb; p->kiocb = kiocb;
if (p->aio) { if (p->aio) {
p->to_free = dup_iter(&p->data, to, GFP_KERNEL); p->to_free = dup_iter(&p->data, to, GFP_KERNEL);
if (!p->to_free) { if (!iter_is_ubuf(&p->data) && !p->to_free) {
kfree(p); kfree(p);
return -ENOMEM; return -ENOMEM;
} }


@ -613,7 +613,7 @@ ep_read_iter(struct kiocb *iocb, struct iov_iter *to)
if (!priv) if (!priv)
goto fail; goto fail;
priv->to_free = dup_iter(&priv->to, to, GFP_KERNEL); priv->to_free = dup_iter(&priv->to, to, GFP_KERNEL);
if (!priv->to_free) { if (!iter_is_ubuf(&priv->to) && !priv->to_free) {
kfree(priv); kfree(priv);
goto fail; goto fail;
} }


@ -641,7 +641,7 @@ vhost_scsi_calc_sgls(struct iov_iter *iter, size_t bytes, int max_sgls)
{ {
int sgl_count = 0; int sgl_count = 0;
if (!iter || !iter->iov) { if (!iter || !iter_iov(iter)) {
pr_err("%s: iter->iov is NULL, but expected bytes: %zu" pr_err("%s: iter->iov is NULL, but expected bytes: %zu"
" present\n", __func__, bytes); " present\n", __func__, bytes);
return -EINVAL; return -EINVAL;


@ -3621,10 +3621,15 @@ static int check_direct_read(struct btrfs_fs_info *fs_info,
if (!iter_is_iovec(iter)) if (!iter_is_iovec(iter))
return 0; return 0;
for (seg = 0; seg < iter->nr_segs; seg++) for (seg = 0; seg < iter->nr_segs; seg++) {
for (i = seg + 1; i < iter->nr_segs; i++) for (i = seg + 1; i < iter->nr_segs; i++) {
if (iter->iov[seg].iov_base == iter->iov[i].iov_base) const struct iovec *iov1 = iter_iov(iter) + seg;
const struct iovec *iov2 = iter_iov(iter) + i;
if (iov1->iov_base == iov2->iov_base)
return -EINVAL; return -EINVAL;
}
}
return 0; return 0;
} }


@ -446,7 +446,7 @@ bailout:
static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma) static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma)
{ {
return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -ENOSYS; return is_nommu_shared_mapping(vma->vm_flags) ? 0 : -ENOSYS;
} }
static unsigned long cramfs_physmem_get_unmapped_area(struct file *file, static unsigned long cramfs_physmem_get_unmapped_area(struct file *file,


@ -153,8 +153,6 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
path_noexec(&file->f_path))) path_noexec(&file->f_path)))
goto exit; goto exit;
fsnotify_open(file);
error = -ENOEXEC; error = -ENOEXEC;
read_lock(&binfmt_lock); read_lock(&binfmt_lock);
@ -939,9 +937,6 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags)
if (err) if (err)
goto exit; goto exit;
if (name->name[0] != '\0')
fsnotify_open(file);
out: out:
return file; return file;


@ -902,7 +902,8 @@ static int ext4_file_open(struct inode *inode, struct file *filp)
return ret; return ret;
} }
filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC |
FMODE_DIO_PARALLEL_WRITE;
return dquot_file_open(inode, filp); return dquot_file_open(inode, filp);
} }


@ -236,7 +236,6 @@ static long do_handle_open(int mountdirfd, struct file_handle __user *ufh,
retval = PTR_ERR(file); retval = PTR_ERR(file);
} else { } else {
retval = fd; retval = fd;
fsnotify_open(file);
fd_install(fd, file); fd_install(fd, file);
} }
path_put(&path); path_put(&path);


@ -1370,7 +1370,7 @@ out:
static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii) static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii)
{ {
return (unsigned long)ii->iov->iov_base + ii->iov_offset; return (unsigned long)iter_iov(ii)->iov_base + ii->iov_offset;
} }
static inline size_t fuse_get_frag_size(const struct iov_iter *ii, static inline size_t fuse_get_frag_size(const struct iov_iter *ii,


@ -19,10 +19,12 @@
* Private flags for iomap_dio, must not overlap with the public ones in * Private flags for iomap_dio, must not overlap with the public ones in
* iomap.h: * iomap.h:
*/ */
#define IOMAP_DIO_WRITE_FUA (1 << 28) #define IOMAP_DIO_CALLER_COMP (1U << 26)
#define IOMAP_DIO_NEED_SYNC (1 << 29) #define IOMAP_DIO_INLINE_COMP (1U << 27)
#define IOMAP_DIO_WRITE (1 << 30) #define IOMAP_DIO_WRITE_THROUGH (1U << 28)
#define IOMAP_DIO_DIRTY (1 << 31) #define IOMAP_DIO_NEED_SYNC (1U << 29)
#define IOMAP_DIO_WRITE (1U << 30)
#define IOMAP_DIO_DIRTY (1U << 31)
struct iomap_dio { struct iomap_dio {
struct kiocb *iocb; struct kiocb *iocb;
@ -40,7 +42,6 @@ struct iomap_dio {
struct { struct {
struct iov_iter *iter; struct iov_iter *iter;
struct task_struct *waiter; struct task_struct *waiter;
struct bio *poll_bio;
} submit; } submit;
/* used for aio completion: */ /* used for aio completion: */
@ -53,12 +54,14 @@ struct iomap_dio {
static void iomap_dio_submit_bio(const struct iomap_iter *iter, static void iomap_dio_submit_bio(const struct iomap_iter *iter,
struct iomap_dio *dio, struct bio *bio, loff_t pos) struct iomap_dio *dio, struct bio *bio, loff_t pos)
{ {
struct kiocb *iocb = dio->iocb;
atomic_inc(&dio->ref); atomic_inc(&dio->ref);
/* Sync dio can't be polled reliably */ /* Sync dio can't be polled reliably */
if ((dio->iocb->ki_flags & IOCB_HIPRI) && !is_sync_kiocb(dio->iocb)) { if ((iocb->ki_flags & IOCB_HIPRI) && !is_sync_kiocb(iocb)) {
bio_set_polled(bio, dio->iocb); bio_set_polled(bio, iocb);
dio->submit.poll_bio = bio; WRITE_ONCE(iocb->private, bio);
} }
if (dio->dops && dio->dops->submit_io) if (dio->dops && dio->dops->submit_io)
@ -126,6 +129,11 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio)
} }
EXPORT_SYMBOL_GPL(iomap_dio_complete); EXPORT_SYMBOL_GPL(iomap_dio_complete);
static ssize_t iomap_dio_deferred_complete(void *data)
{
return iomap_dio_complete(data);
}
static void iomap_dio_complete_work(struct work_struct *work) static void iomap_dio_complete_work(struct work_struct *work)
{ {
struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work); struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
@ -148,27 +156,69 @@ static void iomap_dio_bio_end_io(struct bio *bio)
{ {
struct iomap_dio *dio = bio->bi_private; struct iomap_dio *dio = bio->bi_private;
bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY); bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
struct kiocb *iocb = dio->iocb;
if (bio->bi_status) if (bio->bi_status)
iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status)); iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
if (!atomic_dec_and_test(&dio->ref))
goto release_bio;
if (atomic_dec_and_test(&dio->ref)) { /*
if (dio->wait_for_completion) { * Synchronous dio, task itself will handle any completion work
struct task_struct *waiter = dio->submit.waiter; * that needs after IO. All we need to do is wake the task.
WRITE_ONCE(dio->submit.waiter, NULL); */
blk_wake_io_task(waiter); if (dio->wait_for_completion) {
} else if (dio->flags & IOMAP_DIO_WRITE) { struct task_struct *waiter = dio->submit.waiter;
struct inode *inode = file_inode(dio->iocb->ki_filp);
WRITE_ONCE(dio->iocb->private, NULL); WRITE_ONCE(dio->submit.waiter, NULL);
INIT_WORK(&dio->aio.work, iomap_dio_complete_work); blk_wake_io_task(waiter);
queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work); goto release_bio;
} else {
WRITE_ONCE(dio->iocb->private, NULL);
iomap_dio_complete_work(&dio->aio.work);
}
} }
/*
* Flagged with IOMAP_DIO_INLINE_COMP, we can complete it inline
*/
if (dio->flags & IOMAP_DIO_INLINE_COMP) {
WRITE_ONCE(iocb->private, NULL);
iomap_dio_complete_work(&dio->aio.work);
goto release_bio;
}
/*
* If this dio is flagged with IOMAP_DIO_CALLER_COMP, then schedule
* our completion that way to avoid an async punt to a workqueue.
*/
if (dio->flags & IOMAP_DIO_CALLER_COMP) {
/* only polled IO cares about private cleared */
iocb->private = dio;
iocb->dio_complete = iomap_dio_deferred_complete;
/*
* Invoke ->ki_complete() directly. We've assigned our
* dio_complete callback handler, and since the issuer set
* IOCB_DIO_CALLER_COMP, we know their ki_complete handler will
* notice ->dio_complete being set and will defer calling that
* handler until it can be done from a safe task context.
*
* Note that the 'res' being passed in here is not important
* for this case. The actual completion value of the request
* will be gotten from dio_complete when that is run by the
* issuer.
*/
iocb->ki_complete(iocb, 0);
goto release_bio;
}
/*
* Async DIO completion that requires filesystem level completion work
* gets punted to a work queue to complete as the operation may require
* more IO to be issued to finalise filesystem metadata changes or
* guarantee data integrity.
*/
INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
queue_work(file_inode(iocb->ki_filp)->i_sb->s_dio_done_wq,
&dio->aio.work);
release_bio:
if (should_dirty) { if (should_dirty) {
bio_check_pages_dirty(bio); bio_check_pages_dirty(bio);
} else { } else {
@ -197,7 +247,7 @@ static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
/* /*
* Figure out the bio's operation flags from the dio request, the * Figure out the bio's operation flags from the dio request, the
* mapping, and whether or not we want FUA. Note that we can end up * mapping, and whether or not we want FUA. Note that we can end up
* clearing the WRITE_FUA flag in the dio request. * clearing the WRITE_THROUGH flag in the dio request.
*/ */
static inline unsigned int iomap_dio_bio_opflags(struct iomap_dio *dio, static inline unsigned int iomap_dio_bio_opflags(struct iomap_dio *dio,
const struct iomap *iomap, bool use_fua) const struct iomap *iomap, bool use_fua)
@ -217,7 +267,7 @@ static inline unsigned int iomap_dio_bio_opflags(struct iomap_dio *dio,
if (use_fua) if (use_fua)
opflags |= REQ_FUA; opflags |= REQ_FUA;
else else
dio->flags &= ~IOMAP_DIO_WRITE_FUA; dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
return opflags; return opflags;
} }
@ -258,12 +308,19 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
* Use a FUA write if we need datasync semantics, this is a pure * Use a FUA write if we need datasync semantics, this is a pure
* data IO that doesn't require any metadata updates (including * data IO that doesn't require any metadata updates (including
* after IO completion such as unwritten extent conversion) and * after IO completion such as unwritten extent conversion) and
* the underlying device supports FUA. This allows us to avoid * the underlying device either supports FUA or doesn't have
* cache flushes on IO completion. * a volatile write cache. This allows us to avoid cache flushes
* on IO completion. If we can't use writethrough and need to
* sync, disable in-task completions as dio completion will
* need to call generic_write_sync() which will do a blocking
* fsync / cache flush call.
*/ */
if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) && if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
(dio->flags & IOMAP_DIO_WRITE_FUA) && bdev_fua(iomap->bdev)) (dio->flags & IOMAP_DIO_WRITE_THROUGH) &&
(bdev_fua(iomap->bdev) || !bdev_write_cache(iomap->bdev)))
use_fua = true; use_fua = true;
else if (dio->flags & IOMAP_DIO_NEED_SYNC)
dio->flags &= ~IOMAP_DIO_CALLER_COMP;
} }
/* /*
@ -278,10 +335,23 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
goto out; goto out;
/* /*
* We can only poll for single bio I/Os. * We can only do deferred completion for pure overwrites that
* don't require additional IO at completion. This rules out
* writes that need zeroing or extent conversion, extend
* the file size, or issue journal IO or cache flushes
* during completion processing.
*/ */
if (need_zeroout || if (need_zeroout ||
((dio->flags & IOMAP_DIO_NEED_SYNC) && !use_fua) ||
((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode)))
dio->flags &= ~IOMAP_DIO_CALLER_COMP;
/*
* The rules for polled IO completions follow the guidelines as the
* ones we set for inline and deferred completions. If none of those
* are available for this IO, clear the polled flag.
*/
if (!(dio->flags & (IOMAP_DIO_INLINE_COMP|IOMAP_DIO_CALLER_COMP)))
dio->iocb->ki_flags &= ~IOCB_HIPRI; dio->iocb->ki_flags &= ~IOCB_HIPRI;
if (need_zeroout) { if (need_zeroout) {
@ -502,9 +572,11 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
dio->submit.iter = iter; dio->submit.iter = iter;
dio->submit.waiter = current; dio->submit.waiter = current;
dio->submit.poll_bio = NULL;
if (iov_iter_rw(iter) == READ) { if (iov_iter_rw(iter) == READ) {
/* reads can always complete inline */
dio->flags |= IOMAP_DIO_INLINE_COMP;
if (iomi.pos >= dio->i_size) if (iomi.pos >= dio->i_size)
goto out_free_dio; goto out_free_dio;
@ -523,6 +595,15 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
iomi.flags |= IOMAP_WRITE; iomi.flags |= IOMAP_WRITE;
dio->flags |= IOMAP_DIO_WRITE; dio->flags |= IOMAP_DIO_WRITE;
/*
* Flag as supporting deferred completions, if the issuer
* groks it. This can avoid a workqueue punt for writes.
* We may later clear this flag if we need to do other IO
* as part of this IO completion.
*/
if (iocb->ki_flags & IOCB_DIO_CALLER_COMP)
dio->flags |= IOMAP_DIO_CALLER_COMP;
if (iocb->ki_flags & IOCB_NOWAIT) { if (iocb->ki_flags & IOCB_NOWAIT) {
if (filemap_range_has_page(mapping, iomi.pos, end)) { if (filemap_range_has_page(mapping, iomi.pos, end)) {
ret = -EAGAIN; ret = -EAGAIN;
@ -536,13 +617,16 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
dio->flags |= IOMAP_DIO_NEED_SYNC; dio->flags |= IOMAP_DIO_NEED_SYNC;
/* /*
* For datasync only writes, we optimistically try using FUA for * For datasync only writes, we optimistically try using
* this IO. Any non-FUA write that occurs will clear this flag, * WRITE_THROUGH for this IO. This flag requires either
* hence we know before completion whether a cache flush is * FUA writes through the device's write cache, or a
* necessary. * normal write to a device without a volatile write
* cache. For the former, Any non-FUA write that occurs
* will clear this flag, hence we know before completion
* whether a cache flush is necessary.
*/ */
if ((iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) == IOCB_DSYNC) if ((iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) == IOCB_DSYNC)
dio->flags |= IOMAP_DIO_WRITE_FUA; dio->flags |= IOMAP_DIO_WRITE_THROUGH;
} }
if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) { if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) {
@ -615,14 +699,13 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
iomap_dio_set_error(dio, ret); iomap_dio_set_error(dio, ret);
/* /*
* If all the writes we issued were FUA, we don't need to flush the * If all the writes we issued were already written through to the
* cache on IO completion. Clear the sync flag for this case. * media, we don't need to flush the cache on IO completion. Clear the
* sync flag for this case.
*/ */
if (dio->flags & IOMAP_DIO_WRITE_FUA) if (dio->flags & IOMAP_DIO_WRITE_THROUGH)
dio->flags &= ~IOMAP_DIO_NEED_SYNC; dio->flags &= ~IOMAP_DIO_NEED_SYNC;
WRITE_ONCE(iocb->private, dio->submit.poll_bio);
/* /*
* We are about to drop our additional submission reference, which * We are about to drop our additional submission reference, which
* might be the last reference to the dio. There are three different * might be the last reference to the dio. There are three different


@ -886,6 +886,11 @@ static int do_dentry_open(struct file *f,
truncate_pagecache(inode, 0); truncate_pagecache(inode, 0);
} }
/*
* Once we return a file with FMODE_OPENED, __fput() will call
* fsnotify_close(), so we need fsnotify_open() here for symmetry.
*/
fsnotify_open(f);
return 0; return 0;
cleanup_all: cleanup_all:
@ -1270,7 +1275,6 @@ static long do_sys_openat2(int dfd, const char __user *filename,
put_unused_fd(fd); put_unused_fd(fd);
fd = PTR_ERR(f); fd = PTR_ERR(f);
} else { } else {
fsnotify_open(f);
fd_install(fd, f); fd_install(fd, f);
} }
} }


@ -390,6 +390,12 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
if (!ovl_should_sync(OVL_FS(inode->i_sb))) if (!ovl_should_sync(OVL_FS(inode->i_sb)))
ifl &= ~(IOCB_DSYNC | IOCB_SYNC); ifl &= ~(IOCB_DSYNC | IOCB_SYNC);
/*
* Overlayfs doesn't support deferred completions, don't copy
* this property in case it is set by the issuer.
*/
ifl &= ~IOCB_DIO_CALLER_COMP;
old_cred = ovl_override_creds(file_inode(file)->i_sb); old_cred = ovl_override_creds(file_inode(file)->i_sb);
if (is_sync_kiocb(iocb)) { if (is_sync_kiocb(iocb)) {
file_start_write(real.file); file_start_write(real.file);


@ -40,7 +40,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
} }
if (atomic_read(&mm->mm_count) > 1 || if (atomic_read(&mm->mm_count) > 1 ||
vma->vm_flags & VM_MAYSHARE) { is_nommu_shared_mapping(vma->vm_flags)) {
sbytes += size; sbytes += size;
} else { } else {
bytes += size; bytes += size;


@ -264,7 +264,7 @@ out:
*/ */
static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma) static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma)
{ {
if (!(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) if (!is_nommu_shared_mapping(vma->vm_flags))
return -ENOSYS; return -ENOSYS;
file_accessed(file); file_accessed(file);


@ -749,15 +749,14 @@ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
return -EOPNOTSUPP; return -EOPNOTSUPP;
while (iov_iter_count(iter)) { while (iov_iter_count(iter)) {
struct iovec iovec = iov_iter_iovec(iter);
ssize_t nr; ssize_t nr;
if (type == READ) { if (type == READ) {
nr = filp->f_op->read(filp, iovec.iov_base, nr = filp->f_op->read(filp, iter_iov_addr(iter),
iovec.iov_len, ppos); iter_iov_len(iter), ppos);
} else { } else {
nr = filp->f_op->write(filp, iovec.iov_base, nr = filp->f_op->write(filp, iter_iov_addr(iter),
iovec.iov_len, ppos); iter_iov_len(iter), ppos);
} }
if (nr < 0) { if (nr < 0) {
@ -766,7 +765,7 @@ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
break; break;
} }
ret += nr; ret += nr;
if (nr != iovec.iov_len) if (nr != iter_iov_len(iter))
break; break;
iov_iter_advance(iter, nr); iov_iter_advance(iter, nr);
} }


@ -63,7 +63,7 @@ static unsigned long romfs_get_unmapped_area(struct file *file,
*/ */
static int romfs_mmap(struct file *file, struct vm_area_struct *vma) static int romfs_mmap(struct file *file, struct vm_area_struct *vma)
{ {
return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -ENOSYS; return is_nommu_shared_mapping(vma->vm_flags) ? 0 : -ENOSYS;
} }
static unsigned romfs_mmap_capabilities(struct file *file) static unsigned romfs_mmap_capabilities(struct file *file)


@ -1171,7 +1171,8 @@ xfs_file_open(
{ {
if (xfs_is_shutdown(XFS_M(inode->i_sb))) if (xfs_is_shutdown(XFS_M(inode->i_sb)))
return -EIO; return -EIO;
file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC; file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC |
FMODE_DIO_PARALLEL_WRITE;
return generic_file_open(inode, file); return generic_file_open(inode, file);
} }


@ -159,6 +159,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
/* File supports DIRECT IO */ /* File supports DIRECT IO */
#define FMODE_CAN_ODIRECT ((__force fmode_t)0x400000) #define FMODE_CAN_ODIRECT ((__force fmode_t)0x400000)
/* File supports non-exclusive O_DIRECT writes from multiple threads */
#define FMODE_DIO_PARALLEL_WRITE ((__force fmode_t)0x1000000)
/* File was opened by fanotify and shouldn't generate fanotify events */ /* File was opened by fanotify and shouldn't generate fanotify events */
#define FMODE_NONOTIFY ((__force fmode_t)0x4000000) #define FMODE_NONOTIFY ((__force fmode_t)0x4000000)
@ -312,20 +315,60 @@ enum rw_hint {
#define IOCB_NOIO (1 << 20) #define IOCB_NOIO (1 << 20)
/* can use bio alloc cache */ /* can use bio alloc cache */
#define IOCB_ALLOC_CACHE (1 << 21) #define IOCB_ALLOC_CACHE (1 << 21)
/*
* IOCB_DIO_CALLER_COMP can be set by the iocb owner, to indicate that the
* iocb completion can be passed back to the owner for execution from a safe
* context rather than needing to be punted through a workqueue. If this
* flag is set, the bio completion handling may set iocb->dio_complete to a
* handler function and iocb->private to context information for that handler.
* The issuer should call the handler with that context information from task
* context to complete the processing of the iocb. Note that while this
* provides a task context for the dio_complete() callback, it should only be
* used on the completion side for non-IO generating completions. It's fine to
* call blocking functions from this callback, but they should not wait for
* unrelated IO (like cache flushing, new IO generation, etc).
*/
#define IOCB_DIO_CALLER_COMP (1 << 22)
/* for use in trace events */
#define TRACE_IOCB_STRINGS \
{ IOCB_HIPRI, "HIPRI" }, \
{ IOCB_DSYNC, "DSYNC" }, \
{ IOCB_SYNC, "SYNC" }, \
{ IOCB_NOWAIT, "NOWAIT" }, \
{ IOCB_APPEND, "APPEND" }, \
{ IOCB_EVENTFD, "EVENTFD"}, \
{ IOCB_DIRECT, "DIRECT" }, \
{ IOCB_WRITE, "WRITE" }, \
{ IOCB_WAITQ, "WAITQ" }, \
{ IOCB_NOIO, "NOIO" }, \
{ IOCB_ALLOC_CACHE, "ALLOC_CACHE" }, \
{ IOCB_DIO_CALLER_COMP, "CALLER_COMP" }
struct kiocb { struct kiocb {
struct file *ki_filp; struct file *ki_filp;
/* The 'ki_filp' pointer is shared in a union for aio */
randomized_struct_fields_start
loff_t ki_pos; loff_t ki_pos;
void (*ki_complete)(struct kiocb *iocb, long ret); void (*ki_complete)(struct kiocb *iocb, long ret);
void *private; void *private;
int ki_flags; int ki_flags;
u16 ki_ioprio; /* See linux/ioprio.h */ u16 ki_ioprio; /* See linux/ioprio.h */
struct wait_page_queue *ki_waitq; /* for async buffered IO */ union {
randomized_struct_fields_end /*
* Only used for async buffered reads, where it denotes the
* page waitqueue associated with completing the read. Valid
* IFF IOCB_WAITQ is set.
*/
struct wait_page_queue *ki_waitq;
/*
* Can be used for O_DIRECT IO, where the completion handling
* is punted back to the issuer of the IO. May only be set
* if IOCB_DIO_CALLER_COMP is set by the issuer, and the issuer
* must then check for presence of this handler when ki_complete
* is invoked. The data passed in to this handler must be
* assigned to ->private when dio_complete is assigned.
*/
ssize_t (*dio_complete)(void *data);
};
}; };
static inline bool is_sync_kiocb(struct kiocb *kiocb) static inline bool is_sync_kiocb(struct kiocb *kiocb)


@ -24,7 +24,7 @@ enum io_uring_cmd_flags {
struct io_uring_cmd { struct io_uring_cmd {
struct file *file; struct file *file;
const void *cmd; const struct io_uring_sqe *sqe;
union { union {
/* callback to defer completions to task context */ /* callback to defer completions to task context */
void (*task_work_cb)(struct io_uring_cmd *cmd, unsigned); void (*task_work_cb)(struct io_uring_cmd *cmd, unsigned);
@ -36,18 +36,33 @@ struct io_uring_cmd {
u8 pdu[32]; /* available inline for free use */ u8 pdu[32]; /* available inline for free use */
}; };
static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe)
{
return sqe->cmd;
}
#if defined(CONFIG_IO_URING) #if defined(CONFIG_IO_URING)
int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
struct iov_iter *iter, void *ioucmd); struct iov_iter *iter, void *ioucmd);
void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2, void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2,
unsigned issue_flags); unsigned issue_flags);
void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
void (*task_work_cb)(struct io_uring_cmd *, unsigned));
struct sock *io_uring_get_socket(struct file *file); struct sock *io_uring_get_socket(struct file *file);
void __io_uring_cancel(bool cancel_all); void __io_uring_cancel(bool cancel_all);
void __io_uring_free(struct task_struct *tsk); void __io_uring_free(struct task_struct *tsk);
void io_uring_unreg_ringfd(void); void io_uring_unreg_ringfd(void);
const char *io_uring_get_opcode(u8 opcode); const char *io_uring_get_opcode(u8 opcode);
void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
void (*task_work_cb)(struct io_uring_cmd *, unsigned),
unsigned flags);
/* users should follow semantics of IOU_F_TWQ_LAZY_WAKE */
void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd,
void (*task_work_cb)(struct io_uring_cmd *, unsigned));
static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
void (*task_work_cb)(struct io_uring_cmd *, unsigned))
{
__io_uring_cmd_do_in_task(ioucmd, task_work_cb, 0);
}
static inline void io_uring_files_cancel(void) static inline void io_uring_files_cancel(void)
{ {
@ -66,6 +81,7 @@ static inline void io_uring_free(struct task_struct *tsk)
if (tsk->io_uring) if (tsk->io_uring)
__io_uring_free(tsk); __io_uring_free(tsk);
} }
int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags);
#else #else
static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
struct iov_iter *iter, void *ioucmd) struct iov_iter *iter, void *ioucmd)
@ -80,6 +96,10 @@ static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
void (*task_work_cb)(struct io_uring_cmd *, unsigned)) void (*task_work_cb)(struct io_uring_cmd *, unsigned))
{ {
} }
static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd,
void (*task_work_cb)(struct io_uring_cmd *, unsigned))
{
}
static inline struct sock *io_uring_get_socket(struct file *file) static inline struct sock *io_uring_get_socket(struct file *file)
{ {
return NULL; return NULL;
@ -97,6 +117,11 @@ static inline const char *io_uring_get_opcode(u8 opcode)
{ {
return ""; return "";
} }
static inline int io_uring_cmd_sock(struct io_uring_cmd *cmd,
unsigned int issue_flags)
{
return -EOPNOTSUPP;
}
#endif #endif
#endif #endif


@ -58,7 +58,7 @@ struct io_uring_task {
struct xarray xa; struct xarray xa;
struct wait_queue_head wait; struct wait_queue_head wait;
atomic_t in_idle; atomic_t in_cancel;
atomic_t inflight_tracked; atomic_t inflight_tracked;
struct percpu_counter inflight; struct percpu_counter inflight;
@ -69,8 +69,8 @@ struct io_uring_task {
}; };
struct io_uring { struct io_uring {
u32 head ____cacheline_aligned_in_smp; u32 head;
u32 tail ____cacheline_aligned_in_smp; u32 tail;
}; };
/* /*
@ -176,7 +176,6 @@ struct io_submit_state {
unsigned short submit_nr; unsigned short submit_nr;
unsigned int cqes_count; unsigned int cqes_count;
struct blk_plug plug; struct blk_plug plug;
struct io_uring_cqe cqes[16];
}; };
struct io_ev_fd { struct io_ev_fd {
@ -188,28 +187,34 @@ struct io_ev_fd {
}; };
struct io_alloc_cache { struct io_alloc_cache {
struct hlist_head list; struct io_wq_work_node list;
unsigned int nr_cached; unsigned int nr_cached;
unsigned int max_cached;
size_t elem_size;
}; };
struct io_ring_ctx { struct io_ring_ctx {
/* const or read-mostly hot data */ /* const or read-mostly hot data */
struct { struct {
struct percpu_ref refs;
struct io_rings *rings;
unsigned int flags; unsigned int flags;
enum task_work_notify_mode notify_method;
unsigned int compat: 1;
unsigned int drain_next: 1; unsigned int drain_next: 1;
unsigned int restricted: 1; unsigned int restricted: 1;
unsigned int off_timeout_used: 1; unsigned int off_timeout_used: 1;
unsigned int drain_active: 1; unsigned int drain_active: 1;
unsigned int drain_disabled: 1;
unsigned int has_evfd: 1; unsigned int has_evfd: 1;
unsigned int syscall_iopoll: 1;
/* all CQEs should be posted only by the submitter task */ /* all CQEs should be posted only by the submitter task */
unsigned int task_complete: 1; unsigned int task_complete: 1;
unsigned int lockless_cq: 1;
unsigned int syscall_iopoll: 1;
unsigned int poll_activated: 1;
unsigned int drain_disabled: 1;
unsigned int compat: 1;
struct task_struct *submitter_task;
struct io_rings *rings;
struct percpu_ref refs;
enum task_work_notify_mode notify_method;
} ____cacheline_aligned_in_smp; } ____cacheline_aligned_in_smp;
/* submission data */ /* submission data */
@ -237,7 +242,6 @@ struct io_ring_ctx {
* uring_lock, and updated through io_uring_register(2) * uring_lock, and updated through io_uring_register(2)
*/ */
struct io_rsrc_node *rsrc_node; struct io_rsrc_node *rsrc_node;
int rsrc_cached_refs;
atomic_t cancel_seq; atomic_t cancel_seq;
struct io_file_table file_table; struct io_file_table file_table;
unsigned nr_user_files; unsigned nr_user_files;
@ -248,32 +252,21 @@ struct io_ring_ctx {
struct io_buffer_list *io_bl; struct io_buffer_list *io_bl;
struct xarray io_bl_xa; struct xarray io_bl_xa;
struct list_head io_buffers_cache;
struct io_hash_table cancel_table_locked; struct io_hash_table cancel_table_locked;
struct list_head cq_overflow_list;
struct io_alloc_cache apoll_cache; struct io_alloc_cache apoll_cache;
struct io_alloc_cache netmsg_cache; struct io_alloc_cache netmsg_cache;
/*
* ->iopoll_list is protected by the ctx->uring_lock for
* io_uring instances that don't use IORING_SETUP_SQPOLL.
* For SQPOLL, only the single threaded io_sq_thread() will
* manipulate the list, hence no extra locking is needed there.
*/
struct io_wq_work_list iopoll_list;
bool poll_multi_queue;
} ____cacheline_aligned_in_smp; } ____cacheline_aligned_in_smp;
/* IRQ completion list, under ->completion_lock */
struct io_wq_work_list locked_free_list;
unsigned int locked_free_nr;
const struct cred *sq_creds; /* cred used for __io_sq_thread() */
struct io_sq_data *sq_data; /* if using sq thread polling */
struct wait_queue_head sqo_sq_wait;
struct list_head sqd_list;
unsigned long check_cq;
unsigned int file_alloc_start;
unsigned int file_alloc_end;
struct xarray personalities;
u32 pers_next;
struct { struct {
/* /*
* We cache a range of free CQEs we can use, once exhausted it * We cache a range of free CQEs we can use, once exhausted it
@ -285,54 +278,69 @@ struct io_ring_ctx {
unsigned cached_cq_tail; unsigned cached_cq_tail;
unsigned cq_entries; unsigned cq_entries;
struct io_ev_fd __rcu *io_ev_fd; struct io_ev_fd __rcu *io_ev_fd;
struct wait_queue_head cq_wait;
unsigned cq_extra; unsigned cq_extra;
} ____cacheline_aligned_in_smp; } ____cacheline_aligned_in_smp;
/*
* task_work and async notification delivery cacheline. Expected to
* regularly bounce b/w CPUs.
*/
struct { struct {
spinlock_t completion_lock;
bool poll_multi_queue;
/*
* ->iopoll_list is protected by the ctx->uring_lock for
* io_uring instances that don't use IORING_SETUP_SQPOLL.
* For SQPOLL, only the single threaded io_sq_thread() will
* manipulate the list, hence no extra locking is needed there.
*/
struct io_wq_work_list iopoll_list;
struct io_hash_table cancel_table;
struct llist_head work_llist; struct llist_head work_llist;
unsigned long check_cq;
struct list_head io_buffers_comp; atomic_t cq_wait_nr;
atomic_t cq_timeouts;
struct wait_queue_head cq_wait;
} ____cacheline_aligned_in_smp; } ____cacheline_aligned_in_smp;
/* timeouts */ /* timeouts */
struct { struct {
spinlock_t timeout_lock; spinlock_t timeout_lock;
atomic_t cq_timeouts;
struct list_head timeout_list; struct list_head timeout_list;
struct list_head ltimeout_list; struct list_head ltimeout_list;
unsigned cq_last_tm_flush; unsigned cq_last_tm_flush;
} ____cacheline_aligned_in_smp; } ____cacheline_aligned_in_smp;
/* Keep this last, we don't need it for the fast path */ struct io_uring_cqe completion_cqes[16];
spinlock_t completion_lock;
/* IRQ completion list, under ->completion_lock */
struct io_wq_work_list locked_free_list;
unsigned int locked_free_nr;
struct list_head io_buffers_comp;
struct list_head cq_overflow_list;
struct io_hash_table cancel_table;
const struct cred *sq_creds; /* cred used for __io_sq_thread() */
struct io_sq_data *sq_data; /* if using sq thread polling */
struct wait_queue_head sqo_sq_wait;
struct list_head sqd_list;
unsigned int file_alloc_start;
unsigned int file_alloc_end;
struct xarray personalities;
u32 pers_next;
struct list_head io_buffers_cache;
/* Keep this last, we don't need it for the fast path */
struct wait_queue_head poll_wq;
struct io_restriction restrictions; struct io_restriction restrictions;
struct task_struct *submitter_task;
/* slow path rsrc auxilary data, used by update/register */ /* slow path rsrc auxilary data, used by update/register */
struct io_rsrc_node *rsrc_backup_node;
struct io_mapped_ubuf *dummy_ubuf; struct io_mapped_ubuf *dummy_ubuf;
struct io_rsrc_data *file_data; struct io_rsrc_data *file_data;
struct io_rsrc_data *buf_data; struct io_rsrc_data *buf_data;
struct delayed_work rsrc_put_work; /* protected by ->uring_lock */
struct callback_head rsrc_put_tw;
struct llist_head rsrc_put_llist;
struct list_head rsrc_ref_list; struct list_head rsrc_ref_list;
spinlock_t rsrc_ref_lock; struct io_alloc_cache rsrc_node_cache;
struct wait_queue_head rsrc_quiesce_wq;
unsigned rsrc_quiesce;
struct list_head io_buffers_pages; struct list_head io_buffers_pages;
@ -357,10 +365,25 @@ struct io_ring_ctx {
u32 iowq_limits[2]; u32 iowq_limits[2];
bool iowq_limits_set; bool iowq_limits_set;
struct callback_head poll_wq_task_work;
struct list_head defer_list; struct list_head defer_list;
unsigned sq_thread_idle; unsigned sq_thread_idle;
/* protected by ->completion_lock */ /* protected by ->completion_lock */
unsigned evfd_last_cq_tail; unsigned evfd_last_cq_tail;
/*
* If IORING_SETUP_NO_MMAP is used, then the below holds
* the gup'ed pages for the two rings, and the sqes.
*/
unsigned short n_ring_pages;
unsigned short n_sqe_pages;
struct page **ring_pages;
struct page **sqe_pages;
};
struct io_tw_state {
/* ->uring_lock is taken, callbacks can use io_tw_lock to lock it */
bool locked;
}; };
enum { enum {
@ -391,7 +414,6 @@ enum {
REQ_F_SINGLE_POLL_BIT, REQ_F_SINGLE_POLL_BIT,
REQ_F_DOUBLE_POLL_BIT, REQ_F_DOUBLE_POLL_BIT,
REQ_F_PARTIAL_IO_BIT, REQ_F_PARTIAL_IO_BIT,
REQ_F_CQE32_INIT_BIT,
REQ_F_APOLL_MULTISHOT_BIT, REQ_F_APOLL_MULTISHOT_BIT,
REQ_F_CLEAR_POLLIN_BIT, REQ_F_CLEAR_POLLIN_BIT,
REQ_F_HASH_LOCKED_BIT, REQ_F_HASH_LOCKED_BIT,
@ -461,15 +483,13 @@ enum {
REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT), REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT),
/* fast poll multishot mode */ /* fast poll multishot mode */
REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT), REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT),
/* ->extra1 and ->extra2 are initialised */
REQ_F_CQE32_INIT = BIT(REQ_F_CQE32_INIT_BIT),
/* recvmsg special flag, clear EPOLLIN */ /* recvmsg special flag, clear EPOLLIN */
REQ_F_CLEAR_POLLIN = BIT(REQ_F_CLEAR_POLLIN_BIT), REQ_F_CLEAR_POLLIN = BIT(REQ_F_CLEAR_POLLIN_BIT),
/* hashed into ->cancel_hash_locked, protected by ->uring_lock */ /* hashed into ->cancel_hash_locked, protected by ->uring_lock */
REQ_F_HASH_LOCKED = BIT(REQ_F_HASH_LOCKED_BIT), REQ_F_HASH_LOCKED = BIT(REQ_F_HASH_LOCKED_BIT),
}; };
typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked); typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts);
struct io_task_work { struct io_task_work {
struct llist_node node; struct llist_node node;
@ -559,14 +579,9 @@ struct io_kiocb {
atomic_t refs; atomic_t refs;
atomic_t poll_refs; atomic_t poll_refs;
struct io_task_work io_task_work; struct io_task_work io_task_work;
unsigned nr_tw;
/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
union { struct hlist_node hash_node;
struct hlist_node hash_node;
struct {
u64 extra1;
u64 extra2;
};
};
/* internal polling, see IORING_FEAT_FAST_POLL */ /* internal polling, see IORING_FEAT_FAST_POLL */
struct async_poll *apoll; struct async_poll *apoll;
/* opcode allocated if it needs to store data for async defer */ /* opcode allocated if it needs to store data for async defer */
@ -576,6 +591,11 @@ struct io_kiocb {
/* custom credentials, valid IFF REQ_F_CREDS is set */ /* custom credentials, valid IFF REQ_F_CREDS is set */
const struct cred *creds; const struct cred *creds;
struct io_wq_work work; struct io_wq_work work;
struct {
u64 extra1;
u64 extra2;
} big_cqe;
}; };
struct io_overflow_cqe { struct io_overflow_cqe {


@ -1279,6 +1279,21 @@ static inline bool is_cow_mapping(vm_flags_t flags)
return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
} }
#ifndef CONFIG_MMU
static inline bool is_nommu_shared_mapping(vm_flags_t flags)
{
/*
* NOMMU shared mappings are ordinary MAP_SHARED mappings and selected
* R/O MAP_PRIVATE file mappings that are an effective R/O overlay of
* a file mapping. R/O MAP_PRIVATE mappings might still modify
* underlying memory if ptrace is active, so this is only possible if
* ptrace does not apply. Note that there is no mprotect() to upgrade
* write permissions later.
*/
return flags & VM_MAYSHARE;
}
#endif
#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
#define SECTION_IN_PAGE_FLAGS #define SECTION_IN_PAGE_FLAGS
#endif #endif


@ -18,10 +18,11 @@ static inline int ip_mroute_opt(int opt)
int ip_mroute_setsockopt(struct sock *, int, sockptr_t, unsigned int); int ip_mroute_setsockopt(struct sock *, int, sockptr_t, unsigned int);
int ip_mroute_getsockopt(struct sock *, int, sockptr_t, sockptr_t); int ip_mroute_getsockopt(struct sock *, int, sockptr_t, sockptr_t);
int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg); int ipmr_ioctl(struct sock *sk, int cmd, void *arg);
int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg); int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg);
int ip_mr_init(void); int ip_mr_init(void);
bool ipmr_rule_default(const struct fib_rule *rule); bool ipmr_rule_default(const struct fib_rule *rule);
int ipmr_sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg);
#else #else
static inline int ip_mroute_setsockopt(struct sock *sock, int optname, static inline int ip_mroute_setsockopt(struct sock *sock, int optname,
sockptr_t optval, unsigned int optlen) sockptr_t optval, unsigned int optlen)
@ -35,7 +36,7 @@ static inline int ip_mroute_getsockopt(struct sock *sk, int optname,
return -ENOPROTOOPT; return -ENOPROTOOPT;
} }
static inline int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg) static inline int ipmr_ioctl(struct sock *sk, int cmd, void *arg)
{ {
return -ENOIOCTLCMD; return -ENOIOCTLCMD;
} }
@ -54,6 +55,12 @@ static inline bool ipmr_rule_default(const struct fib_rule *rule)
{ {
return true; return true;
} }
static inline int ipmr_sk_ioctl(struct sock *sk, unsigned int cmd,
void __user *arg)
{
return 1;
}
#endif #endif
#define VIFF_STATIC 0x8000 #define VIFF_STATIC 0x8000


@ -29,10 +29,10 @@ struct sock;
extern int ip6_mroute_setsockopt(struct sock *, int, sockptr_t, unsigned int); extern int ip6_mroute_setsockopt(struct sock *, int, sockptr_t, unsigned int);
extern int ip6_mroute_getsockopt(struct sock *, int, sockptr_t, sockptr_t); extern int ip6_mroute_getsockopt(struct sock *, int, sockptr_t, sockptr_t);
extern int ip6_mr_input(struct sk_buff *skb); extern int ip6_mr_input(struct sk_buff *skb);
extern int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg);
extern int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg); extern int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg);
extern int ip6_mr_init(void); extern int ip6_mr_init(void);
extern void ip6_mr_cleanup(void); extern void ip6_mr_cleanup(void);
int ip6mr_ioctl(struct sock *sk, int cmd, void *arg);
#else #else
static inline int ip6_mroute_setsockopt(struct sock *sock, int optname, static inline int ip6_mroute_setsockopt(struct sock *sock, int optname,
sockptr_t optval, unsigned int optlen) sockptr_t optval, unsigned int optlen)
@ -48,7 +48,7 @@ int ip6_mroute_getsockopt(struct sock *sock,
} }
static inline static inline
int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg) int ip6mr_ioctl(struct sock *sk, int cmd, void *arg)
{ {
return -ENOIOCTLCMD; return -ENOIOCTLCMD;
} }
@ -100,6 +100,27 @@ extern int ip6mr_get_route(struct net *net, struct sk_buff *skb,
#ifdef CONFIG_IPV6_MROUTE #ifdef CONFIG_IPV6_MROUTE
bool mroute6_is_socket(struct net *net, struct sk_buff *skb); bool mroute6_is_socket(struct net *net, struct sk_buff *skb);
extern int ip6mr_sk_done(struct sock *sk); extern int ip6mr_sk_done(struct sock *sk);
static inline int ip6mr_sk_ioctl(struct sock *sk, unsigned int cmd,
void __user *arg)
{
switch (cmd) {
/* These userspace buffers will be consumed by ip6mr_ioctl() */
case SIOCGETMIFCNT_IN6: {
struct sioc_mif_req6 buffer;
return sock_ioctl_inout(sk, cmd, arg, &buffer,
sizeof(buffer));
}
case SIOCGETSGCNT_IN6: {
struct sioc_sg_req6 buffer;
return sock_ioctl_inout(sk, cmd, arg, &buffer,
sizeof(buffer));
}
}
return 1;
}
#else #else
static inline bool mroute6_is_socket(struct net *net, struct sk_buff *skb) static inline bool mroute6_is_socket(struct net *net, struct sk_buff *skb)
{ {
@ -109,5 +130,11 @@ static inline int ip6mr_sk_done(struct sock *sk)
{ {
return 0; return 0;
} }
static inline int ip6mr_sk_ioctl(struct sock *sk, unsigned int cmd,
void __user *arg)
{
return 1;
}
#endif #endif
#endif #endif


@ -320,6 +320,7 @@ struct ucred {
*/ */
#define MSG_ZEROCOPY 0x4000000 /* Use user data in kernel path */ #define MSG_ZEROCOPY 0x4000000 /* Use user data in kernel path */
#define MSG_SPLICE_PAGES 0x8000000 /* Splice the pages from the iterator in sendmsg() */
#define MSG_FASTOPEN 0x20000000 /* Send data in TCP SYN */ #define MSG_FASTOPEN 0x20000000 /* Send data in TCP SYN */
#define MSG_CMSG_CLOEXEC 0x40000000 /* Set close_on_exec for file #define MSG_CMSG_CLOEXEC 0x40000000 /* Set close_on_exec for file
descriptor received through descriptor received through
@ -330,6 +331,8 @@ struct ucred {
#define MSG_CMSG_COMPAT 0 /* We never have 32 bit fixups */ #define MSG_CMSG_COMPAT 0 /* We never have 32 bit fixups */
#endif #endif
/* Flags to be cleared on entry by sendmsg and sendmmsg syscalls */
#define MSG_INTERNAL_SENDMSG_FLAGS (MSG_SPLICE_PAGES)
/* Setsockoptions(2) level. Thanks to BSD these must match IPPROTO_xxx */ /* Setsockoptions(2) level. Thanks to BSD these must match IPPROTO_xxx */
#define SOL_IP 0 #define SOL_IP 0


@ -49,7 +49,8 @@ struct iov_iter {
}; };
size_t count; size_t count;
union { union {
const struct iovec *iov; /* use iter_iov() to get the current vec */
const struct iovec *__iov;
const struct kvec *kvec; const struct kvec *kvec;
const struct bio_vec *bvec; const struct bio_vec *bvec;
struct xarray *xarray; struct xarray *xarray;
@ -66,6 +67,10 @@ struct iov_iter {
}; };
}; };
#define iter_iov(iter) (iter)->__iov
#define iter_iov_addr(iter) (iter_iov(iter)->iov_base + (iter)->iov_offset)
#define iter_iov_len(iter) (iter_iov(iter)->iov_len - (iter)->iov_offset)
static inline enum iter_type iov_iter_type(const struct iov_iter *i) static inline enum iter_type iov_iter_type(const struct iov_iter *i)
{ {
return i->iter_type; return i->iter_type;
@ -141,15 +146,6 @@ static inline size_t iov_length(const struct iovec *iov, unsigned long nr_segs)
return ret; return ret;
} }
static inline struct iovec iov_iter_iovec(const struct iov_iter *iter)
{
return (struct iovec) {
.iov_base = iter->iov->iov_base + iter->iov_offset,
.iov_len = min(iter->count,
iter->iov->iov_len - iter->iov_offset),
};
}
size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, size_t copy_page_from_iter_atomic(struct page *page, unsigned offset,
size_t bytes, struct iov_iter *i); size_t bytes, struct iov_iter *i);
void iov_iter_advance(struct iov_iter *i, size_t bytes); void iov_iter_advance(struct iov_iter *i, size_t bytes);
@ -343,6 +339,7 @@ ssize_t __import_iovec(int type, const struct iovec __user *uvec,
struct iov_iter *i, bool compat); struct iov_iter *i, bool compat);
int import_single_range(int type, void __user *buf, size_t len, int import_single_range(int type, void __user *buf, size_t len,
struct iovec *iov, struct iov_iter *i); struct iovec *iov, struct iov_iter *i);
int import_ubuf(int type, void __user *buf, size_t len, struct iov_iter *i);
static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction, static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction,
void __user *buf, size_t count) void __user *buf, size_t count)


@ -109,4 +109,25 @@ void phonet_sysctl_exit(void);
int isi_register(void); int isi_register(void);
void isi_unregister(void); void isi_unregister(void);
static inline bool sk_is_phonet(struct sock *sk)
{
return sk->sk_family == PF_PHONET;
}
static inline int phonet_sk_ioctl(struct sock *sk, unsigned int cmd,
void __user *arg)
{
int karg;
switch (cmd) {
case SIOCPNADDRESOURCE:
case SIOCPNDELRESOURCE:
if (get_user(karg, (int __user *)arg))
return -EFAULT;
return sk->sk_prot->ioctl(sk, cmd, &karg);
}
/* A positive return value means that the ioctl was not processed */
return 1;
}
#endif #endif


@ -1228,7 +1228,7 @@ struct proto {
bool kern); bool kern);
int (*ioctl)(struct sock *sk, int cmd, int (*ioctl)(struct sock *sk, int cmd,
unsigned long arg); int *karg);
int (*init)(struct sock *sk); int (*init)(struct sock *sk);
void (*destroy)(struct sock *sk); void (*destroy)(struct sock *sk);
void (*shutdown)(struct sock *sk, int how); void (*shutdown)(struct sock *sk, int how);
@ -2972,6 +2972,9 @@ int sock_get_timeout(long timeo, void *optval, bool old_timeval);
int sock_copy_user_timeval(struct __kernel_sock_timeval *tv, int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
sockptr_t optval, int optlen, bool old_timeval); sockptr_t optval, int optlen, bool old_timeval);
int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
void __user *arg, void *karg, size_t size);
int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg);
static inline bool sk_is_readable(struct sock *sk) static inline bool sk_is_readable(struct sock *sk)
{ {
if (sk->sk_prot->sock_is_readable) if (sk->sk_prot->sock_is_readable)


@ -342,7 +342,7 @@ void tcp_release_cb(struct sock *sk);
void tcp_wfree(struct sk_buff *skb); void tcp_wfree(struct sk_buff *skb);
void tcp_write_timer_handler(struct sock *sk); void tcp_write_timer_handler(struct sock *sk);
void tcp_delack_timer_handler(struct sock *sk); void tcp_delack_timer_handler(struct sock *sk);
int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg); int tcp_ioctl(struct sock *sk, int cmd, int *karg);
int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb); int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb);
void tcp_rcv_established(struct sock *sk, struct sk_buff *skb); void tcp_rcv_established(struct sock *sk, struct sk_buff *skb);
void tcp_rcv_space_adjust(struct sock *sk); void tcp_rcv_space_adjust(struct sock *sk);


@ -284,7 +284,7 @@ void udp_flush_pending_frames(struct sock *sk);
int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size); int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size);
void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst); void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst);
int udp_rcv(struct sk_buff *skb); int udp_rcv(struct sk_buff *skb);
int udp_ioctl(struct sock *sk, int cmd, unsigned long arg); int udp_ioctl(struct sock *sk, int cmd, int *karg);
int udp_init_sock(struct sock *sk); int udp_init_sock(struct sock *sk);
int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
int __udp_disconnect(struct sock *sk, int flags); int __udp_disconnect(struct sock *sk, int flags);


@ -360,19 +360,18 @@ TRACE_EVENT(io_uring_complete,
); );
/** /**
* io_uring_submit_sqe - called before submitting one SQE * io_uring_submit_req - called before submitting a request
* *
* @req: pointer to a submitted request * @req: pointer to a submitted request
* @force_nonblock: whether a context blocking or not
* *
* Allows tracking of SQE submission, to understand its source: the SQ * Allows tracking of SQE submission, to understand its source: the SQ
* thread or an io_uring_enter call. * thread or an io_uring_enter call.
*/ */
TRACE_EVENT(io_uring_submit_sqe, TRACE_EVENT(io_uring_submit_req,
TP_PROTO(struct io_kiocb *req, bool force_nonblock), TP_PROTO(struct io_kiocb *req),
TP_ARGS(req, force_nonblock), TP_ARGS(req),
TP_STRUCT__entry ( TP_STRUCT__entry (
__field( void *, ctx ) __field( void *, ctx )
@ -380,7 +379,6 @@ TRACE_EVENT(io_uring_submit_sqe,
__field( unsigned long long, user_data ) __field( unsigned long long, user_data )
__field( u8, opcode ) __field( u8, opcode )
__field( u32, flags ) __field( u32, flags )
__field( bool, force_nonblock )
__field( bool, sq_thread ) __field( bool, sq_thread )
__string( op_str, io_uring_get_opcode(req->opcode) ) __string( op_str, io_uring_get_opcode(req->opcode) )
@ -392,16 +390,15 @@ TRACE_EVENT(io_uring_submit_sqe,
__entry->user_data = req->cqe.user_data; __entry->user_data = req->cqe.user_data;
__entry->opcode = req->opcode; __entry->opcode = req->opcode;
__entry->flags = req->flags; __entry->flags = req->flags;
__entry->force_nonblock = force_nonblock;
__entry->sq_thread = req->ctx->flags & IORING_SETUP_SQPOLL; __entry->sq_thread = req->ctx->flags & IORING_SETUP_SQPOLL;
__assign_str(op_str, io_uring_get_opcode(req->opcode)); __assign_str(op_str, io_uring_get_opcode(req->opcode));
), ),
TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, flags 0x%x, " TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, flags 0x%x, "
"non block %d, sq_thread %d", __entry->ctx, __entry->req, "sq_thread %d", __entry->ctx, __entry->req,
__entry->user_data, __get_str(op_str), __entry->user_data, __get_str(op_str),
__entry->flags, __entry->force_nonblock, __entry->sq_thread) __entry->flags, __entry->sq_thread)
); );
/* /*

View File

@ -173,6 +173,23 @@ enum {
*/ */
#define IORING_SETUP_DEFER_TASKRUN (1U << 13) #define IORING_SETUP_DEFER_TASKRUN (1U << 13)
/*
* Application provides the memory for the rings
*/
#define IORING_SETUP_NO_MMAP (1U << 14)
/*
* Register the ring fd in itself for use with
* IORING_REGISTER_USE_REGISTERED_RING; return a registered fd index rather
* than an fd.
*/
#define IORING_SETUP_REGISTERED_FD_ONLY (1U << 15)
/*
* Removes indirection through the SQ index array.
*/
#define IORING_SETUP_NO_SQARRAY (1U << 16)
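
As a rough illustration of IORING_SETUP_NO_MMAP above, here is a minimal userspace sketch of handing application memory to io_uring_setup(2) through the user_addr fields added further down in this header. The helper name, the 2 MiB huge-page backing and the half-and-half split of the region are illustrative assumptions, not part of the ABI; the implementation in this series appears to require physically contiguous backing pages, which is why a single huge page is used here.

#include <linux/io_uring.h>
#include <stdint.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Hypothetical helper: create a ring backed by caller-supplied memory. */
static int ring_setup_no_mmap(unsigned entries, struct io_uring_params *p)
{
    size_t len = 2 * 1024 * 1024;   /* one huge page, sized generously */
    void *mem = mmap(NULL, len, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
    if (mem == MAP_FAILED)
        return -1;

    memset(p, 0, sizeof(*p));
    p->flags = IORING_SETUP_NO_MMAP;
    /* Rings at the start of the region, SQE array in the second half. */
    p->cq_off.user_addr = (uintptr_t)mem;
    p->sq_off.user_addr = (uintptr_t)mem + len / 2;

    return (int)syscall(__NR_io_uring_setup, entries, p);
}

IORING_SETUP_REGISTERED_FD_ONLY composes with this; as the comment above notes, io_uring_setup() then returns a registered ring index rather than a file descriptor, so the ring has to be driven through the registered-ring paths (io_uring_enter(2) with IORING_ENTER_REGISTERED_RING, and io_uring_register(2) with the flag described below).
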
enum io_uring_op { enum io_uring_op {
IORING_OP_NOP, IORING_OP_NOP,
IORING_OP_READV, IORING_OP_READV,
@ -252,6 +269,7 @@ enum io_uring_op {
#define IORING_TIMEOUT_REALTIME (1U << 3) #define IORING_TIMEOUT_REALTIME (1U << 3)
#define IORING_LINK_TIMEOUT_UPDATE (1U << 4) #define IORING_LINK_TIMEOUT_UPDATE (1U << 4)
#define IORING_TIMEOUT_ETIME_SUCCESS (1U << 5) #define IORING_TIMEOUT_ETIME_SUCCESS (1U << 5)
#define IORING_TIMEOUT_MULTISHOT (1U << 6)
#define IORING_TIMEOUT_CLOCK_MASK (IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME) #define IORING_TIMEOUT_CLOCK_MASK (IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME)
#define IORING_TIMEOUT_UPDATE_MASK (IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE) #define IORING_TIMEOUT_UPDATE_MASK (IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE)
/* /*
@ -286,11 +304,15 @@ enum io_uring_op {
* request 'user_data' * request 'user_data'
* IORING_ASYNC_CANCEL_ANY Match any request * IORING_ASYNC_CANCEL_ANY Match any request
* IORING_ASYNC_CANCEL_FD_FIXED 'fd' passed in is a fixed descriptor * IORING_ASYNC_CANCEL_FD_FIXED 'fd' passed in is a fixed descriptor
* IORING_ASYNC_CANCEL_USERDATA Match on user_data, default for no other key
* IORING_ASYNC_CANCEL_OP Match request based on opcode
*/ */
#define IORING_ASYNC_CANCEL_ALL (1U << 0) #define IORING_ASYNC_CANCEL_ALL (1U << 0)
#define IORING_ASYNC_CANCEL_FD (1U << 1) #define IORING_ASYNC_CANCEL_FD (1U << 1)
#define IORING_ASYNC_CANCEL_ANY (1U << 2) #define IORING_ASYNC_CANCEL_ANY (1U << 2)
#define IORING_ASYNC_CANCEL_FD_FIXED (1U << 3) #define IORING_ASYNC_CANCEL_FD_FIXED (1U << 3)
#define IORING_ASYNC_CANCEL_USERDATA (1U << 4)
#define IORING_ASYNC_CANCEL_OP (1U << 5)
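
For the two new cancel criteria, a minimal raw-SQE sketch (no liburing helpers assumed) of cancelling every in-flight request of one opcode; placing the opcode in sqe->len follows the io_async_cancel_prep() change later in this merge.

#include <linux/io_uring.h>
#include <string.h>

static void prep_cancel_all_of_opcode(struct io_uring_sqe *sqe, __u8 op)
{
    memset(sqe, 0, sizeof(*sqe));
    sqe->opcode = IORING_OP_ASYNC_CANCEL;
    sqe->cancel_flags = IORING_ASYNC_CANCEL_OP | IORING_ASYNC_CANCEL_ALL;
    sqe->len = op;          /* opcode to match, e.g. IORING_OP_READ */
    /* user_data and fd are ignored for this combination of flags. */
}
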
/* /*
* send/sendmsg and recv/recvmsg flags (sqe->ioprio) * send/sendmsg and recv/recvmsg flags (sqe->ioprio)
@ -349,6 +371,8 @@ enum {
* applicable for IORING_MSG_DATA, obviously. * applicable for IORING_MSG_DATA, obviously.
*/ */
#define IORING_MSG_RING_CQE_SKIP (1U << 0) #define IORING_MSG_RING_CQE_SKIP (1U << 0)
/* Pass through the flags from sqe->file_index to cqe->flags */
#define IORING_MSG_RING_FLAGS_PASS (1U << 1)
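
A hedged sketch of how a sender might use IORING_MSG_RING_FLAGS_PASS: the field placement (flags value in sqe->file_index, the IORING_MSG_RING_* bits in sqe->msg_ring_flags) mirrors the msg_ring changes further down in this merge and is shown for illustration only.

#include <linux/io_uring.h>
#include <string.h>

static void prep_msg_ring_pass_cflags(struct io_uring_sqe *sqe, int target_ring_fd,
                                      __u64 user_data, __u32 res, __u32 cqe_flags)
{
    memset(sqe, 0, sizeof(*sqe));
    sqe->opcode = IORING_OP_MSG_RING;   /* sqe->addr == IORING_MSG_DATA (0) */
    sqe->fd = target_ring_fd;
    sqe->off = user_data;               /* cqe->user_data on the target ring */
    sqe->len = res;                     /* cqe->res on the target ring */
    sqe->file_index = cqe_flags;        /* passed through to cqe->flags there */
    sqe->msg_ring_flags = IORING_MSG_RING_FLAGS_PASS;
}
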
/* /*
* IO completion data structure (Completion Queue Entry) * IO completion data structure (Completion Queue Entry)
@ -389,6 +413,9 @@ enum {
#define IORING_OFF_SQ_RING 0ULL #define IORING_OFF_SQ_RING 0ULL
#define IORING_OFF_CQ_RING 0x8000000ULL #define IORING_OFF_CQ_RING 0x8000000ULL
#define IORING_OFF_SQES 0x10000000ULL #define IORING_OFF_SQES 0x10000000ULL
#define IORING_OFF_PBUF_RING 0x80000000ULL
#define IORING_OFF_PBUF_SHIFT 16
#define IORING_OFF_MMAP_MASK 0xf8000000ULL
/* /*
* Filled with the offset for mmap(2) * Filled with the offset for mmap(2)
@ -402,7 +429,7 @@ struct io_sqring_offsets {
__u32 dropped; __u32 dropped;
__u32 array; __u32 array;
__u32 resv1; __u32 resv1;
__u64 resv2; __u64 user_addr;
}; };
/* /*
@ -421,7 +448,7 @@ struct io_cqring_offsets {
__u32 cqes; __u32 cqes;
__u32 flags; __u32 flags;
__u32 resv1; __u32 resv1;
__u64 resv2; __u64 user_addr;
}; };
/* /*
@ -472,6 +499,7 @@ struct io_uring_params {
#define IORING_FEAT_RSRC_TAGS (1U << 10) #define IORING_FEAT_RSRC_TAGS (1U << 10)
#define IORING_FEAT_CQE_SKIP (1U << 11) #define IORING_FEAT_CQE_SKIP (1U << 11)
#define IORING_FEAT_LINKED_FILE (1U << 12) #define IORING_FEAT_LINKED_FILE (1U << 12)
#define IORING_FEAT_REG_REG_RING (1U << 13)
/* /*
* io_uring_register(2) opcodes and arguments * io_uring_register(2) opcodes and arguments
@ -519,7 +547,10 @@ enum {
IORING_REGISTER_FILE_ALLOC_RANGE = 25, IORING_REGISTER_FILE_ALLOC_RANGE = 25,
/* this goes last */ /* this goes last */
IORING_REGISTER_LAST IORING_REGISTER_LAST,
/* flag added to the opcode to use a registered ring fd */
IORING_REGISTER_USE_REGISTERED_RING = 1U << 31
}; };
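
A short illustration of the new IORING_REGISTER_USE_REGISTERED_RING bit, assuming ring_index is the index returned by io_uring_setup(2) with IORING_SETUP_REGISTERED_FD_ONLY, or obtained earlier via IORING_REGISTER_RING_FDS; the flag is simply OR'ed into the opcode, as the comment above says.

#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Register a file table through a registered ring index instead of an fd. */
static int register_files_by_ring_index(unsigned ring_index, int *fds, unsigned nr)
{
    return (int)syscall(__NR_io_uring_register, ring_index,
                        IORING_REGISTER_FILES | IORING_REGISTER_USE_REGISTERED_RING,
                        fds, nr);
}
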
/* io-wq worker categories */ /* io-wq worker categories */
@ -564,19 +595,6 @@ struct io_uring_rsrc_update2 {
__u32 resv2; __u32 resv2;
}; };
struct io_uring_notification_slot {
__u64 tag;
__u64 resv[3];
};
struct io_uring_notification_register {
__u32 nr_slots;
__u32 resv;
__u64 resv2;
__u64 data;
__u64 resv3;
};
/* Skip updating fd indexes set to this value in the fd table */ /* Skip updating fd indexes set to this value in the fd table */
#define IORING_REGISTER_FILES_SKIP (-2) #define IORING_REGISTER_FILES_SKIP (-2)
@ -631,12 +649,26 @@ struct io_uring_buf_ring {
}; };
}; };
/*
* Flags for IORING_REGISTER_PBUF_RING.
*
* IOU_PBUF_RING_MMAP: If set, kernel will allocate the memory for the ring.
* The application must not set a ring_addr in struct
* io_uring_buf_reg, instead it must subsequently call
* mmap(2) with the offset set as:
* IORING_OFF_PBUF_RING | (bgid << IORING_OFF_PBUF_SHIFT)
* to get a virtual mapping for the ring.
*/
enum {
IOU_PBUF_RING_MMAP = 1,
};
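
Tying the flag above to the new IORING_OFF_PBUF_RING/IORING_OFF_PBUF_SHIFT mmap offsets, here is a minimal sketch of registering a kernel-allocated buffer ring and mapping it into the application. Error handling is trimmed, and the sizing mirrors the ring_entries * sizeof(struct io_uring_buf_ring) allocation shown later in this merge.

#include <linux/io_uring.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

static struct io_uring_buf_ring *register_kernel_pbuf_ring(int ring_fd,
                                                           __u32 entries,
                                                           __u16 bgid)
{
    struct io_uring_buf_reg reg;
    void *ring;
    off_t off;

    memset(&reg, 0, sizeof(reg));
    reg.ring_entries = entries;         /* must be a power of two */
    reg.bgid = bgid;
    reg.flags = IOU_PBUF_RING_MMAP;     /* ring_addr stays 0 */

    if (syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_PBUF_RING,
                &reg, 1) < 0)
        return NULL;

    off = IORING_OFF_PBUF_RING | ((off_t)bgid << IORING_OFF_PBUF_SHIFT);
    ring = mmap(NULL, entries * sizeof(struct io_uring_buf_ring),
                PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
                ring_fd, off);
    return ring == MAP_FAILED ? NULL : ring;
}
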
/* argument for IORING_(UN)REGISTER_PBUF_RING */ /* argument for IORING_(UN)REGISTER_PBUF_RING */
struct io_uring_buf_reg { struct io_uring_buf_reg {
__u64 ring_addr; __u64 ring_addr;
__u32 ring_entries; __u32 ring_entries;
__u16 bgid; __u16 bgid;
__u16 pad; __u16 flags;
__u64 resv[3]; __u64 resv[3];
}; };
@ -674,7 +706,9 @@ struct io_uring_sync_cancel_reg {
__s32 fd; __s32 fd;
__u32 flags; __u32 flags;
struct __kernel_timespec timeout; struct __kernel_timespec timeout;
__u64 pad[4]; __u8 opcode;
__u8 pad[7];
__u64 pad2[3];
}; };
/* /*
@ -694,6 +728,14 @@ struct io_uring_recvmsg_out {
__u32 flags; __u32 flags;
}; };
/*
* Argument for IORING_OP_URING_CMD when file is a socket
*/
enum {
SOCKET_URING_OP_SIOCINQ = 0,
SOCKET_URING_OP_SIOCOUTQ,
};
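
The new socket-level IORING_OP_URING_CMD operations replace SIOCINQ/SIOCOUTQ ioctl round-trips. A raw-SQE sketch, assuming the SQE's cmd_op field (which overlays sqe->off in the uapi layout) selects the operation and the byte count comes back in cqe->res:

#include <linux/io_uring.h>
#include <string.h>

/* Ask how many bytes are waiting in sockfd's receive queue (SIOCINQ). */
static void prep_sock_siocinq(struct io_uring_sqe *sqe, int sockfd, __u64 user_data)
{
    memset(sqe, 0, sizeof(*sqe));
    sqe->opcode = IORING_OP_URING_CMD;
    sqe->fd = sockfd;
    sqe->cmd_op = SOCKET_URING_OP_SIOCINQ;
    sqe->user_data = user_data;
}
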
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif

View File

@ -39,6 +39,7 @@ int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
ma->addr = READ_ONCE(sqe->addr); ma->addr = READ_ONCE(sqe->addr);
ma->len = READ_ONCE(sqe->len); ma->len = READ_ONCE(sqe->len);
ma->advice = READ_ONCE(sqe->fadvise_advice); ma->advice = READ_ONCE(sqe->fadvise_advice);
req->flags |= REQ_F_FORCE_ASYNC;
return 0; return 0;
#else #else
return -EOPNOTSUPP; return -EOPNOTSUPP;
@ -51,8 +52,7 @@ int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
struct io_madvise *ma = io_kiocb_to_cmd(req, struct io_madvise); struct io_madvise *ma = io_kiocb_to_cmd(req, struct io_madvise);
int ret; int ret;
if (issue_flags & IO_URING_F_NONBLOCK) WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
return -EAGAIN;
ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice); ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
io_req_set_res(req, ret, 0); io_req_set_res(req, ret, 0);
@ -62,6 +62,18 @@ int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
#endif #endif
} }
static bool io_fadvise_force_async(struct io_fadvise *fa)
{
switch (fa->advice) {
case POSIX_FADV_NORMAL:
case POSIX_FADV_RANDOM:
case POSIX_FADV_SEQUENTIAL:
return false;
default:
return true;
}
}
int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{ {
struct io_fadvise *fa = io_kiocb_to_cmd(req, struct io_fadvise); struct io_fadvise *fa = io_kiocb_to_cmd(req, struct io_fadvise);
@ -72,6 +84,8 @@ int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
fa->offset = READ_ONCE(sqe->off); fa->offset = READ_ONCE(sqe->off);
fa->len = READ_ONCE(sqe->len); fa->len = READ_ONCE(sqe->len);
fa->advice = READ_ONCE(sqe->fadvise_advice); fa->advice = READ_ONCE(sqe->fadvise_advice);
if (io_fadvise_force_async(fa))
req->flags |= REQ_F_FORCE_ASYNC;
return 0; return 0;
} }
@ -80,16 +94,7 @@ int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
struct io_fadvise *fa = io_kiocb_to_cmd(req, struct io_fadvise); struct io_fadvise *fa = io_kiocb_to_cmd(req, struct io_fadvise);
int ret; int ret;
if (issue_flags & IO_URING_F_NONBLOCK) { WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK && io_fadvise_force_async(fa));
switch (fa->advice) {
case POSIX_FADV_NORMAL:
case POSIX_FADV_RANDOM:
case POSIX_FADV_SEQUENTIAL:
break;
default:
return -EAGAIN;
}
}
ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice); ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
if (ret < 0) if (ret < 0)

View File

@ -7,47 +7,60 @@
#define IO_ALLOC_CACHE_MAX 512 #define IO_ALLOC_CACHE_MAX 512
struct io_cache_entry { struct io_cache_entry {
struct hlist_node node; struct io_wq_work_node node;
}; };
static inline bool io_alloc_cache_put(struct io_alloc_cache *cache, static inline bool io_alloc_cache_put(struct io_alloc_cache *cache,
struct io_cache_entry *entry) struct io_cache_entry *entry)
{ {
if (cache->nr_cached < IO_ALLOC_CACHE_MAX) { if (cache->nr_cached < cache->max_cached) {
cache->nr_cached++; cache->nr_cached++;
hlist_add_head(&entry->node, &cache->list); wq_stack_add_head(&entry->node, &cache->list);
/* KASAN poisons object */
kasan_slab_free_mempool(entry);
return true; return true;
} }
return false; return false;
} }
static inline bool io_alloc_cache_empty(struct io_alloc_cache *cache)
{
return !cache->list.next;
}
static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *cache) static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *cache)
{ {
if (!hlist_empty(&cache->list)) { if (cache->list.next) {
struct hlist_node *node = cache->list.first; struct io_cache_entry *entry;
hlist_del(node); entry = container_of(cache->list.next, struct io_cache_entry, node);
kasan_unpoison_range(entry, cache->elem_size);
cache->list.next = cache->list.next->next;
cache->nr_cached--; cache->nr_cached--;
return container_of(node, struct io_cache_entry, node); return entry;
} }
return NULL; return NULL;
} }
static inline void io_alloc_cache_init(struct io_alloc_cache *cache) static inline void io_alloc_cache_init(struct io_alloc_cache *cache,
unsigned max_nr, size_t size)
{ {
INIT_HLIST_HEAD(&cache->list); cache->list.next = NULL;
cache->nr_cached = 0; cache->nr_cached = 0;
cache->max_cached = max_nr;
cache->elem_size = size;
} }
static inline void io_alloc_cache_free(struct io_alloc_cache *cache, static inline void io_alloc_cache_free(struct io_alloc_cache *cache,
void (*free)(struct io_cache_entry *)) void (*free)(struct io_cache_entry *))
{ {
while (!hlist_empty(&cache->list)) { while (1) {
struct hlist_node *node = cache->list.first; struct io_cache_entry *entry = io_alloc_cache_get(cache);
hlist_del(node); if (!entry)
free(container_of(node, struct io_cache_entry, node)); break;
free(entry);
} }
cache->nr_cached = 0; cache->nr_cached = 0;
} }

View File

@ -22,33 +22,54 @@ struct io_cancel {
u64 addr; u64 addr;
u32 flags; u32 flags;
s32 fd; s32 fd;
u8 opcode;
}; };
#define CANCEL_FLAGS (IORING_ASYNC_CANCEL_ALL | IORING_ASYNC_CANCEL_FD | \ #define CANCEL_FLAGS (IORING_ASYNC_CANCEL_ALL | IORING_ASYNC_CANCEL_FD | \
IORING_ASYNC_CANCEL_ANY | IORING_ASYNC_CANCEL_FD_FIXED) IORING_ASYNC_CANCEL_ANY | IORING_ASYNC_CANCEL_FD_FIXED | \
IORING_ASYNC_CANCEL_USERDATA | IORING_ASYNC_CANCEL_OP)
/*
* Returns true if the request matches the criteria outlined by 'cd'.
*/
bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd)
{
bool match_user_data = cd->flags & IORING_ASYNC_CANCEL_USERDATA;
if (req->ctx != cd->ctx)
return false;
if (!(cd->flags & (IORING_ASYNC_CANCEL_FD | IORING_ASYNC_CANCEL_OP)))
match_user_data = true;
if (cd->flags & IORING_ASYNC_CANCEL_ANY)
goto check_seq;
if (cd->flags & IORING_ASYNC_CANCEL_FD) {
if (req->file != cd->file)
return false;
}
if (cd->flags & IORING_ASYNC_CANCEL_OP) {
if (req->opcode != cd->opcode)
return false;
}
if (match_user_data && req->cqe.user_data != cd->data)
return false;
if (cd->flags & IORING_ASYNC_CANCEL_ALL) {
check_seq:
if (cd->seq == req->work.cancel_seq)
return false;
req->work.cancel_seq = cd->seq;
}
return true;
}
static bool io_cancel_cb(struct io_wq_work *work, void *data) static bool io_cancel_cb(struct io_wq_work *work, void *data)
{ {
struct io_kiocb *req = container_of(work, struct io_kiocb, work); struct io_kiocb *req = container_of(work, struct io_kiocb, work);
struct io_cancel_data *cd = data; struct io_cancel_data *cd = data;
if (req->ctx != cd->ctx) return io_cancel_req_match(req, cd);
return false;
if (cd->flags & IORING_ASYNC_CANCEL_ANY) {
;
} else if (cd->flags & IORING_ASYNC_CANCEL_FD) {
if (req->file != cd->file)
return false;
} else {
if (req->cqe.user_data != cd->data)
return false;
}
if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) {
if (cd->seq == req->work.cancel_seq)
return false;
req->work.cancel_seq = cd->seq;
}
return true;
} }
static int io_async_cancel_one(struct io_uring_task *tctx, static int io_async_cancel_one(struct io_uring_task *tctx,
@ -111,7 +132,7 @@ int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (unlikely(req->flags & REQ_F_BUFFER_SELECT)) if (unlikely(req->flags & REQ_F_BUFFER_SELECT))
return -EINVAL; return -EINVAL;
if (sqe->off || sqe->len || sqe->splice_fd_in) if (sqe->off || sqe->splice_fd_in)
return -EINVAL; return -EINVAL;
cancel->addr = READ_ONCE(sqe->addr); cancel->addr = READ_ONCE(sqe->addr);
@ -123,6 +144,11 @@ int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return -EINVAL; return -EINVAL;
cancel->fd = READ_ONCE(sqe->fd); cancel->fd = READ_ONCE(sqe->fd);
} }
if (cancel->flags & IORING_ASYNC_CANCEL_OP) {
if (cancel->flags & IORING_ASYNC_CANCEL_ANY)
return -EINVAL;
cancel->opcode = READ_ONCE(sqe->len);
}
return 0; return 0;
} }
@ -169,6 +195,7 @@ int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
.ctx = req->ctx, .ctx = req->ctx,
.data = cancel->addr, .data = cancel->addr,
.flags = cancel->flags, .flags = cancel->flags,
.opcode = cancel->opcode,
.seq = atomic_inc_return(&req->ctx->cancel_seq), .seq = atomic_inc_return(&req->ctx->cancel_seq),
}; };
struct io_uring_task *tctx = req->task->io_uring; struct io_uring_task *tctx = req->task->io_uring;
@ -216,13 +243,10 @@ static int __io_sync_cancel(struct io_uring_task *tctx,
/* fixed must be grabbed every time since we drop the uring_lock */ /* fixed must be grabbed every time since we drop the uring_lock */
if ((cd->flags & IORING_ASYNC_CANCEL_FD) && if ((cd->flags & IORING_ASYNC_CANCEL_FD) &&
(cd->flags & IORING_ASYNC_CANCEL_FD_FIXED)) { (cd->flags & IORING_ASYNC_CANCEL_FD_FIXED)) {
unsigned long file_ptr;
if (unlikely(fd >= ctx->nr_user_files)) if (unlikely(fd >= ctx->nr_user_files))
return -EBADF; return -EBADF;
fd = array_index_nospec(fd, ctx->nr_user_files); fd = array_index_nospec(fd, ctx->nr_user_files);
file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr; cd->file = io_file_from_index(&ctx->file_table, fd);
cd->file = (struct file *) (file_ptr & FFS_MASK);
if (!cd->file) if (!cd->file)
return -EBADF; return -EBADF;
} }
@ -241,17 +265,22 @@ int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg)
struct io_uring_sync_cancel_reg sc; struct io_uring_sync_cancel_reg sc;
struct fd f = { }; struct fd f = { };
DEFINE_WAIT(wait); DEFINE_WAIT(wait);
int ret; int ret, i;
if (copy_from_user(&sc, arg, sizeof(sc))) if (copy_from_user(&sc, arg, sizeof(sc)))
return -EFAULT; return -EFAULT;
if (sc.flags & ~CANCEL_FLAGS) if (sc.flags & ~CANCEL_FLAGS)
return -EINVAL; return -EINVAL;
if (sc.pad[0] || sc.pad[1] || sc.pad[2] || sc.pad[3]) for (i = 0; i < ARRAY_SIZE(sc.pad); i++)
return -EINVAL; if (sc.pad[i])
return -EINVAL;
for (i = 0; i < ARRAY_SIZE(sc.pad2); i++)
if (sc.pad2[i])
return -EINVAL;
cd.data = sc.addr; cd.data = sc.addr;
cd.flags = sc.flags; cd.flags = sc.flags;
cd.opcode = sc.opcode;
/* we can grab a normal file descriptor upfront */ /* we can grab a normal file descriptor upfront */
if ((cd.flags & IORING_ASYNC_CANCEL_FD) && if ((cd.flags & IORING_ASYNC_CANCEL_FD) &&

View File

@ -8,11 +8,11 @@ struct io_cancel_data {
u64 data; u64 data;
struct file *file; struct file *file;
}; };
u8 opcode;
u32 flags; u32 flags;
int seq; int seq;
}; };
int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags); int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags);
@ -21,3 +21,4 @@ int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd,
void init_hash_table(struct io_hash_table *table, unsigned size); void init_hash_table(struct io_hash_table *table, unsigned size);
int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg); int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg);
bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd);

View File

@ -48,10 +48,13 @@ static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id,
return 0; return 0;
} }
static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, /*
struct seq_file *m) * Caller holds a reference to the file already, we don't need to do
* anything else to get an extra reference.
*/
__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
{ {
struct io_sq_data *sq = NULL; struct io_ring_ctx *ctx = f->private_data;
struct io_overflow_cqe *ocqe; struct io_overflow_cqe *ocqe;
struct io_rings *r = ctx->rings; struct io_rings *r = ctx->rings;
unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1; unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1;
@ -62,6 +65,7 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
unsigned int cq_shift = 0; unsigned int cq_shift = 0;
unsigned int sq_shift = 0; unsigned int sq_shift = 0;
unsigned int sq_entries, cq_entries; unsigned int sq_entries, cq_entries;
int sq_pid = -1, sq_cpu = -1;
bool has_lock; bool has_lock;
unsigned int i; unsigned int i;
@ -91,6 +95,8 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
struct io_uring_sqe *sqe; struct io_uring_sqe *sqe;
unsigned int sq_idx; unsigned int sq_idx;
if (ctx->flags & IORING_SETUP_NO_SQARRAY)
break;
sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]); sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]);
if (sq_idx > sq_mask) if (sq_idx > sq_mask)
continue; continue;
@ -139,13 +145,19 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
has_lock = mutex_trylock(&ctx->uring_lock); has_lock = mutex_trylock(&ctx->uring_lock);
if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) { if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) {
sq = ctx->sq_data; struct io_sq_data *sq = ctx->sq_data;
if (!sq->thread)
sq = NULL; if (mutex_trylock(&sq->lock)) {
if (sq->thread) {
sq_pid = task_pid_nr(sq->thread);
sq_cpu = task_cpu(sq->thread);
}
mutex_unlock(&sq->lock);
}
} }
seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1); seq_printf(m, "SqThread:\t%d\n", sq_pid);
seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1); seq_printf(m, "SqThreadCpu:\t%d\n", sq_cpu);
seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files); seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
for (i = 0; has_lock && i < ctx->nr_user_files; i++) { for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
struct file *f = io_file_from_index(&ctx->file_table, i); struct file *f = io_file_from_index(&ctx->file_table, i);
@ -205,14 +217,4 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
spin_unlock(&ctx->completion_lock); spin_unlock(&ctx->completion_lock);
} }
__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
{
struct io_ring_ctx *ctx = f->private_data;
if (percpu_ref_tryget(&ctx->refs)) {
__io_uring_show_fdinfo(ctx, m);
percpu_ref_put(&ctx->refs);
}
}
#endif #endif

View File

@ -64,7 +64,6 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
u32 slot_index) u32 slot_index)
__must_hold(&req->ctx->uring_lock) __must_hold(&req->ctx->uring_lock)
{ {
bool needs_switch = false;
struct io_fixed_file *file_slot; struct io_fixed_file *file_slot;
int ret; int ret;
@ -79,20 +78,13 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
file_slot = io_fixed_file_slot(&ctx->file_table, slot_index); file_slot = io_fixed_file_slot(&ctx->file_table, slot_index);
if (file_slot->file_ptr) { if (file_slot->file_ptr) {
struct file *old_file;
ret = io_rsrc_node_switch_start(ctx);
if (ret)
goto err;
old_file = (struct file *)(file_slot->file_ptr & FFS_MASK);
ret = io_queue_rsrc_removal(ctx->file_data, slot_index, ret = io_queue_rsrc_removal(ctx->file_data, slot_index,
ctx->rsrc_node, old_file); io_slot_file(file_slot));
if (ret) if (ret)
goto err; return ret;
file_slot->file_ptr = 0; file_slot->file_ptr = 0;
io_file_bitmap_clear(&ctx->file_table, slot_index); io_file_bitmap_clear(&ctx->file_table, slot_index);
needs_switch = true;
} }
ret = io_scm_file_account(ctx, file); ret = io_scm_file_account(ctx, file);
@ -101,9 +93,6 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
io_fixed_file_set(file_slot, file); io_fixed_file_set(file_slot, file);
io_file_bitmap_set(&ctx->file_table, slot_index); io_file_bitmap_set(&ctx->file_table, slot_index);
} }
err:
if (needs_switch)
io_rsrc_node_switch(ctx, ctx->file_data);
return ret; return ret;
} }
@ -149,30 +138,25 @@ int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags,
int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset) int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset)
{ {
struct io_fixed_file *file_slot; struct io_fixed_file *file_slot;
struct file *file;
int ret; int ret;
if (unlikely(!ctx->file_data)) if (unlikely(!ctx->file_data))
return -ENXIO; return -ENXIO;
if (offset >= ctx->nr_user_files) if (offset >= ctx->nr_user_files)
return -EINVAL; return -EINVAL;
ret = io_rsrc_node_switch_start(ctx);
if (ret)
return ret;
offset = array_index_nospec(offset, ctx->nr_user_files); offset = array_index_nospec(offset, ctx->nr_user_files);
file_slot = io_fixed_file_slot(&ctx->file_table, offset); file_slot = io_fixed_file_slot(&ctx->file_table, offset);
if (!file_slot->file_ptr) if (!file_slot->file_ptr)
return -EBADF; return -EBADF;
file = (struct file *)(file_slot->file_ptr & FFS_MASK); ret = io_queue_rsrc_removal(ctx->file_data, offset,
ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file); io_slot_file(file_slot));
if (ret) if (ret)
return ret; return ret;
file_slot->file_ptr = 0; file_slot->file_ptr = 0;
io_file_bitmap_clear(&ctx->file_table, offset); io_file_bitmap_clear(&ctx->file_table, offset);
io_rsrc_node_switch(ctx, ctx->file_data);
return 0; return 0;
} }

View File

@ -5,10 +5,6 @@
#include <linux/file.h> #include <linux/file.h>
#include <linux/io_uring_types.h> #include <linux/io_uring_types.h>
#define FFS_NOWAIT 0x1UL
#define FFS_ISREG 0x2UL
#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG)
bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files); bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files);
void io_free_file_tables(struct io_file_table *table); void io_free_file_tables(struct io_file_table *table);
@ -43,21 +39,31 @@ io_fixed_file_slot(struct io_file_table *table, unsigned i)
return &table->files[i]; return &table->files[i];
} }
#define FFS_NOWAIT 0x1UL
#define FFS_ISREG 0x2UL
#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG)
static inline unsigned int io_slot_flags(struct io_fixed_file *slot)
{
return (slot->file_ptr & ~FFS_MASK) << REQ_F_SUPPORT_NOWAIT_BIT;
}
static inline struct file *io_slot_file(struct io_fixed_file *slot)
{
return (struct file *)(slot->file_ptr & FFS_MASK);
}
static inline struct file *io_file_from_index(struct io_file_table *table, static inline struct file *io_file_from_index(struct io_file_table *table,
int index) int index)
{ {
struct io_fixed_file *slot = io_fixed_file_slot(table, index); return io_slot_file(io_fixed_file_slot(table, index));
return (struct file *) (slot->file_ptr & FFS_MASK);
} }
static inline void io_fixed_file_set(struct io_fixed_file *file_slot, static inline void io_fixed_file_set(struct io_fixed_file *file_slot,
struct file *file) struct file *file)
{ {
unsigned long file_ptr = (unsigned long) file; file_slot->file_ptr = (unsigned long)file |
(io_file_get_flags(file) >> REQ_F_SUPPORT_NOWAIT_BIT);
file_ptr |= io_file_get_flags(file);
file_slot->file_ptr = file_ptr;
} }
static inline void io_reset_alloc_hint(struct io_ring_ctx *ctx) static inline void io_reset_alloc_hint(struct io_ring_ctx *ctx)

View File

@ -74,6 +74,7 @@ int io_renameat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
} }
req->flags |= REQ_F_NEED_CLEANUP; req->flags |= REQ_F_NEED_CLEANUP;
req->flags |= REQ_F_FORCE_ASYNC;
return 0; return 0;
} }
@ -82,8 +83,7 @@ int io_renameat(struct io_kiocb *req, unsigned int issue_flags)
struct io_rename *ren = io_kiocb_to_cmd(req, struct io_rename); struct io_rename *ren = io_kiocb_to_cmd(req, struct io_rename);
int ret; int ret;
if (issue_flags & IO_URING_F_NONBLOCK) WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
return -EAGAIN;
ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd, ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd,
ren->newpath, ren->flags); ren->newpath, ren->flags);
@ -123,6 +123,7 @@ int io_unlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return PTR_ERR(un->filename); return PTR_ERR(un->filename);
req->flags |= REQ_F_NEED_CLEANUP; req->flags |= REQ_F_NEED_CLEANUP;
req->flags |= REQ_F_FORCE_ASYNC;
return 0; return 0;
} }
@ -131,8 +132,7 @@ int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
struct io_unlink *un = io_kiocb_to_cmd(req, struct io_unlink); struct io_unlink *un = io_kiocb_to_cmd(req, struct io_unlink);
int ret; int ret;
if (issue_flags & IO_URING_F_NONBLOCK) WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
return -EAGAIN;
if (un->flags & AT_REMOVEDIR) if (un->flags & AT_REMOVEDIR)
ret = do_rmdir(un->dfd, un->filename); ret = do_rmdir(un->dfd, un->filename);
@ -170,6 +170,7 @@ int io_mkdirat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return PTR_ERR(mkd->filename); return PTR_ERR(mkd->filename);
req->flags |= REQ_F_NEED_CLEANUP; req->flags |= REQ_F_NEED_CLEANUP;
req->flags |= REQ_F_FORCE_ASYNC;
return 0; return 0;
} }
@ -178,8 +179,7 @@ int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags)
struct io_mkdir *mkd = io_kiocb_to_cmd(req, struct io_mkdir); struct io_mkdir *mkd = io_kiocb_to_cmd(req, struct io_mkdir);
int ret; int ret;
if (issue_flags & IO_URING_F_NONBLOCK) WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
return -EAGAIN;
ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode); ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode);
@ -220,6 +220,7 @@ int io_symlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
} }
req->flags |= REQ_F_NEED_CLEANUP; req->flags |= REQ_F_NEED_CLEANUP;
req->flags |= REQ_F_FORCE_ASYNC;
return 0; return 0;
} }
@ -228,8 +229,7 @@ int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags)
struct io_link *sl = io_kiocb_to_cmd(req, struct io_link); struct io_link *sl = io_kiocb_to_cmd(req, struct io_link);
int ret; int ret;
if (issue_flags & IO_URING_F_NONBLOCK) WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
return -EAGAIN;
ret = do_symlinkat(sl->oldpath, sl->new_dfd, sl->newpath); ret = do_symlinkat(sl->oldpath, sl->new_dfd, sl->newpath);
@ -243,7 +243,7 @@ int io_linkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
struct io_link *lnk = io_kiocb_to_cmd(req, struct io_link); struct io_link *lnk = io_kiocb_to_cmd(req, struct io_link);
const char __user *oldf, *newf; const char __user *oldf, *newf;
if (sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) if (sqe->buf_index || sqe->splice_fd_in)
return -EINVAL; return -EINVAL;
if (unlikely(req->flags & REQ_F_FIXED_FILE)) if (unlikely(req->flags & REQ_F_FIXED_FILE))
return -EBADF; return -EBADF;
@ -265,6 +265,7 @@ int io_linkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
} }
req->flags |= REQ_F_NEED_CLEANUP; req->flags |= REQ_F_NEED_CLEANUP;
req->flags |= REQ_F_FORCE_ASYNC;
return 0; return 0;
} }
@ -273,8 +274,7 @@ int io_linkat(struct io_kiocb *req, unsigned int issue_flags)
struct io_link *lnk = io_kiocb_to_cmd(req, struct io_link); struct io_link *lnk = io_kiocb_to_cmd(req, struct io_link);
int ret; int ret;
if (issue_flags & IO_URING_F_NONBLOCK) WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
return -EAGAIN;
ret = do_linkat(lnk->old_dfd, lnk->oldpath, lnk->new_dfd, ret = do_linkat(lnk->old_dfd, lnk->oldpath, lnk->new_dfd,
lnk->newpath, lnk->flags); lnk->newpath, lnk->flags);

File diff suppressed because it is too large

View File

@ -50,8 +50,9 @@ void io_wq_put_and_exit(struct io_wq *wq);
void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work); void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
void io_wq_hash_work(struct io_wq_work *work, void *val); void io_wq_hash_work(struct io_wq_work *work, void *val);
int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask); int io_wq_cpu_affinity(struct io_uring_task *tctx, cpumask_var_t mask);
int io_wq_max_workers(struct io_wq *wq, int *new_count); int io_wq_max_workers(struct io_wq *wq, int *new_count);
bool io_wq_worker_stopped(void);
static inline bool io_wq_is_hashed(struct io_wq_work *work) static inline bool io_wq_is_hashed(struct io_wq_work *work)
{ {

File diff suppressed because it is too large

View File

@ -4,6 +4,7 @@
#include <linux/errno.h> #include <linux/errno.h>
#include <linux/lockdep.h> #include <linux/lockdep.h>
#include <linux/resume_user_mode.h> #include <linux/resume_user_mode.h>
#include <linux/kasan.h>
#include <linux/io_uring_types.h> #include <linux/io_uring_types.h>
#include <uapi/linux/eventpoll.h> #include <uapi/linux/eventpoll.h>
#include "io-wq.h" #include "io-wq.h"
@ -14,6 +15,17 @@
#include <trace/events/io_uring.h> #include <trace/events/io_uring.h>
#endif #endif
enum {
/*
* A hint to not wake right away but delay until there are enough
* tw's queued to match the number of CQEs the task is waiting for.
*
* Must not be used with requests generating more than one CQE.
* It's also ignored unless IORING_SETUP_DEFER_TASKRUN is set.
*/
IOU_F_TWQ_LAZY_WAKE = 1,
};
enum { enum {
IOU_OK = 0, IOU_OK = 0,
IOU_ISSUE_SKIP_COMPLETE = -EIOCBQUEUED, IOU_ISSUE_SKIP_COMPLETE = -EIOCBQUEUED,
@ -26,16 +38,13 @@ enum {
IOU_STOP_MULTISHOT = -ECANCELED, IOU_STOP_MULTISHOT = -ECANCELED,
}; };
struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow); bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow);
bool io_req_cqe_overflow(struct io_kiocb *req); void io_req_cqe_overflow(struct io_kiocb *req);
int io_run_task_work_sig(struct io_ring_ctx *ctx); int io_run_task_work_sig(struct io_ring_ctx *ctx);
int __io_run_local_work(struct io_ring_ctx *ctx, bool *locked);
int io_run_local_work(struct io_ring_ctx *ctx);
void io_req_defer_failed(struct io_kiocb *req, s32 res); void io_req_defer_failed(struct io_kiocb *req, s32 res);
void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags); void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags);
bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags);
bool io_aux_cqe(struct io_ring_ctx *ctx, bool defer, u64 user_data, s32 res, u32 cflags, bool io_fill_cqe_req_aux(struct io_kiocb *req, bool defer, s32 res, u32 cflags);
bool allow_overflow);
void __io_commit_cqring_flush(struct io_ring_ctx *ctx); void __io_commit_cqring_flush(struct io_ring_ctx *ctx);
struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages);
@ -44,28 +53,26 @@ struct file *io_file_get_normal(struct io_kiocb *req, int fd);
struct file *io_file_get_fixed(struct io_kiocb *req, int fd, struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
unsigned issue_flags); unsigned issue_flags);
static inline bool io_req_ffs_set(struct io_kiocb *req) void __io_req_task_work_add(struct io_kiocb *req, unsigned flags);
{
return req->flags & REQ_F_FIXED_FILE;
}
void __io_req_task_work_add(struct io_kiocb *req, bool allow_local);
bool io_is_uring_fops(struct file *file); bool io_is_uring_fops(struct file *file);
bool io_alloc_async_data(struct io_kiocb *req); bool io_alloc_async_data(struct io_kiocb *req);
void io_req_task_queue(struct io_kiocb *req); void io_req_task_queue(struct io_kiocb *req);
void io_queue_iowq(struct io_kiocb *req, bool *dont_use); void io_queue_iowq(struct io_kiocb *req, struct io_tw_state *ts_dont_use);
void io_req_task_complete(struct io_kiocb *req, bool *locked); void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts);
void io_req_task_queue_fail(struct io_kiocb *req, int ret); void io_req_task_queue_fail(struct io_kiocb *req, int ret);
void io_req_task_submit(struct io_kiocb *req, bool *locked); void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts);
void tctx_task_work(struct callback_head *cb); void tctx_task_work(struct callback_head *cb);
__cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
int io_uring_alloc_task_context(struct task_struct *task, int io_uring_alloc_task_context(struct task_struct *task,
struct io_ring_ctx *ctx); struct io_ring_ctx *ctx);
int io_poll_issue(struct io_kiocb *req, bool *locked); int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file,
int start, int end);
int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts);
int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr); int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr);
int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin); int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin);
void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node); void __io_submit_flush_completions(struct io_ring_ctx *ctx);
int io_req_prep_async(struct io_kiocb *req); int io_req_prep_async(struct io_kiocb *req);
struct io_wq_work *io_wq_free_work(struct io_wq_work *work); struct io_wq_work *io_wq_free_work(struct io_wq_work *work);
@ -73,61 +80,73 @@ void io_wq_submit_work(struct io_wq_work *work);
void io_free_req(struct io_kiocb *req); void io_free_req(struct io_kiocb *req);
void io_queue_next(struct io_kiocb *req); void io_queue_next(struct io_kiocb *req);
void __io_put_task(struct task_struct *task, int nr);
void io_task_refs_refill(struct io_uring_task *tctx); void io_task_refs_refill(struct io_uring_task *tctx);
bool __io_alloc_req_refill(struct io_ring_ctx *ctx); bool __io_alloc_req_refill(struct io_ring_ctx *ctx);
bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
bool cancel_all); bool cancel_all);
#define io_lockdep_assert_cq_locked(ctx) \ #if defined(CONFIG_PROVE_LOCKING)
do { \ static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
if (ctx->flags & IORING_SETUP_IOPOLL) { \ {
lockdep_assert_held(&ctx->uring_lock); \ lockdep_assert(in_task());
} else if (!ctx->task_complete) { \
lockdep_assert_held(&ctx->completion_lock); \ if (ctx->flags & IORING_SETUP_IOPOLL) {
} else if (ctx->submitter_task->flags & PF_EXITING) { \ lockdep_assert_held(&ctx->uring_lock);
lockdep_assert(current_work()); \ } else if (!ctx->task_complete) {
} else { \ lockdep_assert_held(&ctx->completion_lock);
lockdep_assert(current == ctx->submitter_task); \ } else if (ctx->submitter_task) {
} \ /*
} while (0) * ->submitter_task may be NULL and we can still post a CQE,
* if the ring has been setup with IORING_SETUP_R_DISABLED.
* Not from an SQE, as those cannot be submitted, but via
* updating tagged resources.
*/
if (ctx->submitter_task->flags & PF_EXITING)
lockdep_assert(current_work());
else
lockdep_assert(current == ctx->submitter_task);
}
}
#else
static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
{
}
#endif
static inline void io_req_task_work_add(struct io_kiocb *req) static inline void io_req_task_work_add(struct io_kiocb *req)
{ {
__io_req_task_work_add(req, true); __io_req_task_work_add(req, 0);
} }
#define io_for_each_link(pos, head) \ #define io_for_each_link(pos, head) \
for (pos = (head); pos; pos = pos->link) for (pos = (head); pos; pos = pos->link)
void io_cq_unlock_post(struct io_ring_ctx *ctx); static inline bool io_get_cqe_overflow(struct io_ring_ctx *ctx,
struct io_uring_cqe **ret,
static inline struct io_uring_cqe *io_get_cqe_overflow(struct io_ring_ctx *ctx, bool overflow)
bool overflow)
{ {
io_lockdep_assert_cq_locked(ctx); io_lockdep_assert_cq_locked(ctx);
if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) { if (unlikely(ctx->cqe_cached >= ctx->cqe_sentinel)) {
struct io_uring_cqe *cqe = ctx->cqe_cached; if (unlikely(!io_cqe_cache_refill(ctx, overflow)))
return false;
ctx->cached_cq_tail++;
ctx->cqe_cached++;
if (ctx->flags & IORING_SETUP_CQE32)
ctx->cqe_cached++;
return cqe;
} }
*ret = ctx->cqe_cached;
return __io_get_cqe(ctx, overflow); ctx->cached_cq_tail++;
ctx->cqe_cached++;
if (ctx->flags & IORING_SETUP_CQE32)
ctx->cqe_cached++;
return true;
} }
static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) static inline bool io_get_cqe(struct io_ring_ctx *ctx, struct io_uring_cqe **ret)
{ {
return io_get_cqe_overflow(ctx, false); return io_get_cqe_overflow(ctx, ret, false);
} }
static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx, static __always_inline bool io_fill_cqe_req(struct io_ring_ctx *ctx,
struct io_kiocb *req) struct io_kiocb *req)
{ {
struct io_uring_cqe *cqe; struct io_uring_cqe *cqe;
@ -136,39 +155,22 @@ static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx,
* submission (by quite a lot). Increment the overflow count in * submission (by quite a lot). Increment the overflow count in
* the ring. * the ring.
*/ */
cqe = io_get_cqe(ctx); if (unlikely(!io_get_cqe(ctx, &cqe)))
if (unlikely(!cqe))
return false; return false;
trace_io_uring_complete(req->ctx, req, req->cqe.user_data, if (trace_io_uring_complete_enabled())
req->cqe.res, req->cqe.flags, trace_io_uring_complete(req->ctx, req, req->cqe.user_data,
(req->flags & REQ_F_CQE32_INIT) ? req->extra1 : 0, req->cqe.res, req->cqe.flags,
(req->flags & REQ_F_CQE32_INIT) ? req->extra2 : 0); req->big_cqe.extra1, req->big_cqe.extra2);
memcpy(cqe, &req->cqe, sizeof(*cqe)); memcpy(cqe, &req->cqe, sizeof(*cqe));
if (ctx->flags & IORING_SETUP_CQE32) { if (ctx->flags & IORING_SETUP_CQE32) {
u64 extra1 = 0, extra2 = 0; memcpy(cqe->big_cqe, &req->big_cqe, sizeof(*cqe));
memset(&req->big_cqe, 0, sizeof(req->big_cqe));
if (req->flags & REQ_F_CQE32_INIT) {
extra1 = req->extra1;
extra2 = req->extra2;
}
WRITE_ONCE(cqe->big_cqe[0], extra1);
WRITE_ONCE(cqe->big_cqe[1], extra2);
} }
return true; return true;
} }
static inline bool io_fill_cqe_req(struct io_ring_ctx *ctx,
struct io_kiocb *req)
{
if (likely(__io_fill_cqe_req(ctx, req)))
return true;
return io_req_cqe_overflow(req);
}
static inline void req_set_fail(struct io_kiocb *req) static inline void req_set_fail(struct io_kiocb *req)
{ {
req->flags |= REQ_F_FAIL; req->flags |= REQ_F_FAIL;
@ -189,10 +191,10 @@ static inline bool req_has_async_data(struct io_kiocb *req)
return req->flags & REQ_F_ASYNC_DATA; return req->flags & REQ_F_ASYNC_DATA;
} }
static inline void io_put_file(struct file *file) static inline void io_put_file(struct io_kiocb *req)
{ {
if (file) if (!(req->flags & REQ_F_FIXED_FILE) && req->file)
fput(file); fput(req->file);
} }
static inline void io_ring_submit_unlock(struct io_ring_ctx *ctx, static inline void io_ring_submit_unlock(struct io_ring_ctx *ctx,
@ -223,8 +225,14 @@ static inline void io_commit_cqring(struct io_ring_ctx *ctx)
smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail); smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
} }
/* requires smb_mb() prior, see wq_has_sleeper() */ static inline void io_poll_wq_wake(struct io_ring_ctx *ctx)
static inline void __io_cqring_wake(struct io_ring_ctx *ctx) {
if (wq_has_sleeper(&ctx->poll_wq))
__wake_up(&ctx->poll_wq, TASK_NORMAL, 0,
poll_to_key(EPOLL_URING_WAKE | EPOLLIN));
}
static inline void io_cqring_wake(struct io_ring_ctx *ctx)
{ {
/* /*
* Trigger waitqueue handler on all waiters on our waitqueue. This * Trigger waitqueue handler on all waiters on our waitqueue. This
@ -236,17 +244,11 @@ static inline void __io_cqring_wake(struct io_ring_ctx *ctx)
* waitqueue handlers, we know we have a dependency between eventfd or * waitqueue handlers, we know we have a dependency between eventfd or
* epoll and should terminate multishot poll at that point. * epoll and should terminate multishot poll at that point.
*/ */
if (waitqueue_active(&ctx->cq_wait)) if (wq_has_sleeper(&ctx->cq_wait))
__wake_up(&ctx->cq_wait, TASK_NORMAL, 0, __wake_up(&ctx->cq_wait, TASK_NORMAL, 0,
poll_to_key(EPOLL_URING_WAKE | EPOLLIN)); poll_to_key(EPOLL_URING_WAKE | EPOLLIN));
} }
static inline void io_cqring_wake(struct io_ring_ctx *ctx)
{
smp_mb();
__io_cqring_wake(ctx);
}
static inline bool io_sqring_full(struct io_ring_ctx *ctx) static inline bool io_sqring_full(struct io_ring_ctx *ctx)
{ {
struct io_rings *r = ctx->rings; struct io_rings *r = ctx->rings;
@ -257,9 +259,11 @@ static inline bool io_sqring_full(struct io_ring_ctx *ctx)
static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
{ {
struct io_rings *rings = ctx->rings; struct io_rings *rings = ctx->rings;
unsigned int entries;
/* make sure SQ entry isn't read before tail */ /* make sure SQ entry isn't read before tail */
return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head; entries = smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
return min(entries, ctx->sq_entries);
} }
static inline int io_run_task_work(void) static inline int io_run_task_work(void)
@ -294,47 +298,11 @@ static inline bool io_task_work_pending(struct io_ring_ctx *ctx)
return task_work_pending(current) || !wq_list_empty(&ctx->work_llist); return task_work_pending(current) || !wq_list_empty(&ctx->work_llist);
} }
static inline int io_run_task_work_ctx(struct io_ring_ctx *ctx) static inline void io_tw_lock(struct io_ring_ctx *ctx, struct io_tw_state *ts)
{ {
int ret = 0; if (!ts->locked) {
int ret2;
if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
ret = io_run_local_work(ctx);
/* want to run this after in case more is added */
ret2 = io_run_task_work();
/* Try propagate error in favour of if tasks were run,
* but still make sure to run them if requested
*/
if (ret >= 0)
ret += ret2;
return ret;
}
static inline int io_run_local_work_locked(struct io_ring_ctx *ctx)
{
bool locked;
int ret;
if (llist_empty(&ctx->work_llist))
return 0;
locked = true;
ret = __io_run_local_work(ctx, &locked);
/* shouldn't happen! */
if (WARN_ON_ONCE(!locked))
mutex_lock(&ctx->uring_lock); mutex_lock(&ctx->uring_lock);
return ret; ts->locked = true;
}
static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked)
{
if (!*locked) {
mutex_lock(&ctx->uring_lock);
*locked = true;
} }
} }
@ -355,19 +323,11 @@ static inline void io_req_complete_defer(struct io_kiocb *req)
static inline void io_commit_cqring_flush(struct io_ring_ctx *ctx) static inline void io_commit_cqring_flush(struct io_ring_ctx *ctx)
{ {
if (unlikely(ctx->off_timeout_used || ctx->drain_active || ctx->has_evfd)) if (unlikely(ctx->off_timeout_used || ctx->drain_active ||
ctx->has_evfd || ctx->poll_activated))
__io_commit_cqring_flush(ctx); __io_commit_cqring_flush(ctx);
} }
/* must to be called somewhat shortly after putting a request */
static inline void io_put_task(struct task_struct *task, int nr)
{
if (likely(task == current))
task->io_uring->cached_refs += nr;
else
__io_put_task(task, nr);
}
static inline void io_get_task_refs(int nr) static inline void io_get_task_refs(int nr)
{ {
struct io_uring_task *tctx = current->io_uring; struct io_uring_task *tctx = current->io_uring;
@ -382,19 +342,30 @@ static inline bool io_req_cache_empty(struct io_ring_ctx *ctx)
return !ctx->submit_state.free_list.next; return !ctx->submit_state.free_list.next;
} }
static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx) extern struct kmem_cache *req_cachep;
static inline struct io_kiocb *io_extract_req(struct io_ring_ctx *ctx)
{ {
if (unlikely(io_req_cache_empty(ctx))) struct io_kiocb *req;
return __io_alloc_req_refill(ctx);
req = container_of(ctx->submit_state.free_list.next, struct io_kiocb, comp_list);
wq_stack_extract(&ctx->submit_state.free_list);
return req;
}
static inline bool io_alloc_req(struct io_ring_ctx *ctx, struct io_kiocb **req)
{
if (unlikely(io_req_cache_empty(ctx))) {
if (!__io_alloc_req_refill(ctx))
return false;
}
*req = io_extract_req(ctx);
return true; return true;
} }
static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx) static inline bool io_allowed_defer_tw_run(struct io_ring_ctx *ctx)
{ {
struct io_wq_work_node *node; return likely(ctx->submitter_task == current);
node = wq_stack_extract(&ctx->submit_state.free_list);
return container_of(node, struct io_kiocb, comp_list);
} }
static inline bool io_allowed_run_tw(struct io_ring_ctx *ctx) static inline bool io_allowed_run_tw(struct io_ring_ctx *ctx)
@ -410,4 +381,14 @@ static inline void io_req_queue_tw_complete(struct io_kiocb *req, s32 res)
io_req_task_work_add(req); io_req_task_work_add(req);
} }
/*
* IORING_SETUP_SQE128 contexts allocate twice the normal SQE size for each
* slot.
*/
static inline size_t uring_sqe_size(struct io_ring_ctx *ctx)
{
if (ctx->flags & IORING_SETUP_SQE128)
return 2 * sizeof(struct io_uring_sqe);
return sizeof(struct io_uring_sqe);
}
#endif #endif

View File

@ -137,7 +137,8 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
return NULL; return NULL;
head &= bl->mask; head &= bl->mask;
if (head < IO_BUFFER_LIST_BUF_PER_PAGE) { /* mmaped buffers are always contig */
if (bl->is_mmap || head < IO_BUFFER_LIST_BUF_PER_PAGE) {
buf = &br->bufs[head]; buf = &br->bufs[head];
} else { } else {
int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1); int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1);
@ -179,7 +180,7 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
bl = io_buffer_get_list(ctx, req->buf_index); bl = io_buffer_get_list(ctx, req->buf_index);
if (likely(bl)) { if (likely(bl)) {
if (bl->buf_nr_pages) if (bl->is_mapped)
ret = io_ring_buffer_select(req, len, bl, issue_flags); ret = io_ring_buffer_select(req, len, bl, issue_flags);
else else
ret = io_provided_buffer_select(req, len, bl); ret = io_provided_buffer_select(req, len, bl);
@ -214,17 +215,24 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
if (!nbufs) if (!nbufs)
return 0; return 0;
if (bl->buf_nr_pages) { if (bl->is_mapped) {
int j;
i = bl->buf_ring->tail - bl->head; i = bl->buf_ring->tail - bl->head;
for (j = 0; j < bl->buf_nr_pages; j++) if (bl->is_mmap) {
unpin_user_page(bl->buf_pages[j]); folio_put(virt_to_folio(bl->buf_ring));
kvfree(bl->buf_pages); bl->buf_ring = NULL;
bl->buf_pages = NULL; bl->is_mmap = 0;
bl->buf_nr_pages = 0; } else if (bl->buf_nr_pages) {
int j;
for (j = 0; j < bl->buf_nr_pages; j++)
unpin_user_page(bl->buf_pages[j]);
kvfree(bl->buf_pages);
bl->buf_pages = NULL;
bl->buf_nr_pages = 0;
}
/* make sure it's seen as empty */ /* make sure it's seen as empty */
INIT_LIST_HEAD(&bl->buf_list); INIT_LIST_HEAD(&bl->buf_list);
bl->is_mapped = 0;
return i; return i;
} }
@ -304,7 +312,7 @@ int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
if (bl) { if (bl) {
ret = -EINVAL; ret = -EINVAL;
/* can't use provide/remove buffers command on mapped buffers */ /* can't use provide/remove buffers command on mapped buffers */
if (!bl->buf_nr_pages) if (!bl->is_mapped)
ret = __io_remove_buffers(ctx, bl, p->nbufs); ret = __io_remove_buffers(ctx, bl, p->nbufs);
} }
io_ring_submit_unlock(ctx, issue_flags); io_ring_submit_unlock(ctx, issue_flags);
@ -449,7 +457,7 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
} }
} }
/* can't add buffers via this command for a mapped buffer ring */ /* can't add buffers via this command for a mapped buffer ring */
if (bl->buf_nr_pages) { if (bl->is_mapped) {
ret = -EINVAL; ret = -EINVAL;
goto err; goto err;
} }
@ -464,23 +472,98 @@ err:
return IOU_OK; return IOU_OK;
} }
int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
struct io_buffer_list *bl)
{ {
struct io_uring_buf_ring *br; struct io_uring_buf_ring *br;
struct page **pages;
int i, nr_pages;
pages = io_pin_pages(reg->ring_addr,
flex_array_size(br, bufs, reg->ring_entries),
&nr_pages);
if (IS_ERR(pages))
return PTR_ERR(pages);
/*
* Apparently some 32-bit boxes (ARM) will return highmem pages,
* which then need to be mapped. We could support that, but it'd
* complicate the code and slow down the common cases quite a bit.
* So just error out, returning -EINVAL just like we did on kernels
* that didn't support mapped buffer rings.
*/
for (i = 0; i < nr_pages; i++)
if (PageHighMem(pages[i]))
goto error_unpin;
br = page_address(pages[0]);
#ifdef SHM_COLOUR
/*
* On platforms that have specific aliasing requirements, SHM_COLOUR
* is set and we must guarantee that the kernel and user side align
* nicely. We cannot do that if IOU_PBUF_RING_MMAP isn't set and
* the application mmap's the provided ring buffer. Fail the request
* if we, by chance, don't end up with aligned addresses. The app
* should use IOU_PBUF_RING_MMAP instead, and liburing will handle
* this transparently.
*/
if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1))
goto error_unpin;
#endif
bl->buf_pages = pages;
bl->buf_nr_pages = nr_pages;
bl->buf_ring = br;
bl->is_mapped = 1;
bl->is_mmap = 0;
return 0;
error_unpin:
for (i = 0; i < nr_pages; i++)
unpin_user_page(pages[i]);
kvfree(pages);
return -EINVAL;
}
static int io_alloc_pbuf_ring(struct io_uring_buf_reg *reg,
struct io_buffer_list *bl)
{
gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
size_t ring_size;
void *ptr;
ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring);
ptr = (void *) __get_free_pages(gfp, get_order(ring_size));
if (!ptr)
return -ENOMEM;
bl->buf_ring = ptr;
bl->is_mapped = 1;
bl->is_mmap = 1;
return 0;
}
int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
struct io_uring_buf_reg reg; struct io_uring_buf_reg reg;
struct io_buffer_list *bl, *free_bl = NULL; struct io_buffer_list *bl, *free_bl = NULL;
struct page **pages; int ret;
int nr_pages;
if (copy_from_user(&reg, arg, sizeof(reg))) if (copy_from_user(&reg, arg, sizeof(reg)))
return -EFAULT; return -EFAULT;
if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2]) if (reg.resv[0] || reg.resv[1] || reg.resv[2])
return -EINVAL; return -EINVAL;
if (!reg.ring_addr) if (reg.flags & ~IOU_PBUF_RING_MMAP)
return -EFAULT;
if (reg.ring_addr & ~PAGE_MASK)
return -EINVAL; return -EINVAL;
if (!(reg.flags & IOU_PBUF_RING_MMAP)) {
if (!reg.ring_addr)
return -EFAULT;
if (reg.ring_addr & ~PAGE_MASK)
return -EINVAL;
} else {
if (reg.ring_addr)
return -EINVAL;
}
if (!is_power_of_2(reg.ring_entries)) if (!is_power_of_2(reg.ring_entries))
return -EINVAL; return -EINVAL;
@ -497,7 +580,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
bl = io_buffer_get_list(ctx, reg.bgid); bl = io_buffer_get_list(ctx, reg.bgid);
if (bl) { if (bl) {
/* if mapped buffer ring OR classic exists, don't allow */ /* if mapped buffer ring OR classic exists, don't allow */
if (bl->buf_nr_pages || !list_empty(&bl->buf_list)) if (bl->is_mapped || !list_empty(&bl->buf_list))
return -EEXIST; return -EEXIST;
} else { } else {
free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL); free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL);
@ -505,22 +588,21 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
return -ENOMEM; return -ENOMEM;
} }
pages = io_pin_pages(reg.ring_addr, if (!(reg.flags & IOU_PBUF_RING_MMAP))
flex_array_size(br, bufs, reg.ring_entries), ret = io_pin_pbuf_ring(&reg, bl);
&nr_pages); else
if (IS_ERR(pages)) { ret = io_alloc_pbuf_ring(&reg, bl);
kfree(free_bl);
return PTR_ERR(pages); if (!ret) {
bl->nr_entries = reg.ring_entries;
bl->mask = reg.ring_entries - 1;
io_buffer_add_list(ctx, bl, reg.bgid);
return 0;
} }
br = page_address(pages[0]); kfree(free_bl);
bl->buf_pages = pages; return ret;
bl->buf_nr_pages = nr_pages;
bl->nr_entries = reg.ring_entries;
bl->buf_ring = br;
bl->mask = reg.ring_entries - 1;
io_buffer_add_list(ctx, bl, reg.bgid);
return 0;
} }
int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
@ -530,13 +612,15 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
if (copy_from_user(&reg, arg, sizeof(reg))) if (copy_from_user(&reg, arg, sizeof(reg)))
return -EFAULT; return -EFAULT;
if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2]) if (reg.resv[0] || reg.resv[1] || reg.resv[2])
return -EINVAL;
if (reg.flags)
return -EINVAL; return -EINVAL;
bl = io_buffer_get_list(ctx, reg.bgid); bl = io_buffer_get_list(ctx, reg.bgid);
if (!bl) if (!bl)
return -ENOENT; return -ENOENT;
if (!bl->buf_nr_pages) if (!bl->is_mapped)
return -EINVAL; return -EINVAL;
__io_remove_buffers(ctx, bl, -1U); __io_remove_buffers(ctx, bl, -1U);
@ -546,3 +630,14 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
} }
return 0; return 0;
} }
void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid)
{
struct io_buffer_list *bl;
bl = io_buffer_get_list(ctx, bgid);
if (!bl || !bl->is_mmap)
return NULL;
return bl->buf_ring;
}

View File

@ -23,6 +23,11 @@ struct io_buffer_list {
__u16 nr_entries; __u16 nr_entries;
__u16 head; __u16 head;
__u16 mask; __u16 mask;
/* ring mapped provided buffers */
__u8 is_mapped;
/* ring mapped provided buffers, but mmap'ed by application */
__u8 is_mmap;
}; };
struct io_buffer { struct io_buffer {
@ -50,6 +55,8 @@ unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags);
void io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); void io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags);
void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid);
static inline void io_kbuf_recycle_ring(struct io_kiocb *req) static inline void io_kbuf_recycle_ring(struct io_kiocb *req)
{ {
/* /*

View File

@@ -13,6 +13,11 @@
 #include "filetable.h"
 #include "msg_ring.h"

+/* All valid masks for MSG_RING */
+#define IORING_MSG_RING_MASK		(IORING_MSG_RING_CQE_SKIP | \
+					IORING_MSG_RING_FLAGS_PASS)
+
 struct io_msg {
 	struct file			*file;
 	struct file			*src_file;
@@ -21,7 +26,10 @@ struct io_msg {
 	u32 len;
 	u32 cmd;
 	u32 src_fd;
-	u32 dst_fd;
+	union {
+		u32 dst_fd;
+		u32 cqe_flags;
+	};
 	u32 flags;
 };
@@ -91,6 +99,11 @@ static void io_msg_tw_complete(struct callback_head *head)
 	if (current->flags & PF_EXITING) {
 		ret = -EOWNERDEAD;
 	} else {
+		u32 flags = 0;
+
+		if (msg->flags & IORING_MSG_RING_FLAGS_PASS)
+			flags = msg->cqe_flags;
+
 		/*
 		 * If the target ring is using IOPOLL mode, then we need to be
 		 * holding the uring_lock for posting completions. Other ring
@@ -99,7 +112,7 @@ static void io_msg_tw_complete(struct callback_head *head)
 		 */
 		if (target_ctx->flags & IORING_SETUP_IOPOLL)
 			mutex_lock(&target_ctx->uring_lock);
-		if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0))
+		if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags))
 			ret = -EOVERFLOW;
 		if (target_ctx->flags & IORING_SETUP_IOPOLL)
 			mutex_unlock(&target_ctx->uring_lock);
@@ -114,9 +127,12 @@ static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_ring_ctx *target_ctx = req->file->private_data;
 	struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
+	u32 flags = 0;
 	int ret;

-	if (msg->src_fd || msg->dst_fd || msg->flags)
+	if (msg->src_fd || msg->flags & ~IORING_MSG_RING_FLAGS_PASS)
+		return -EINVAL;
+	if (!(msg->flags & IORING_MSG_RING_FLAGS_PASS) && msg->dst_fd)
 		return -EINVAL;
 	if (target_ctx->flags & IORING_SETUP_R_DISABLED)
 		return -EBADFD;
@@ -124,15 +140,18 @@ static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags)
 	if (io_msg_need_remote(target_ctx))
 		return io_msg_exec_remote(req, io_msg_tw_complete);

+	if (msg->flags & IORING_MSG_RING_FLAGS_PASS)
+		flags = msg->cqe_flags;
+
 	ret = -EOVERFLOW;
 	if (target_ctx->flags & IORING_SETUP_IOPOLL) {
 		if (unlikely(io_double_lock_ctx(target_ctx, issue_flags)))
 			return -EAGAIN;
-		if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0))
+		if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags))
 			ret = 0;
 		io_double_unlock_ctx(target_ctx);
 	} else {
-		if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0))
+		if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags))
 			ret = 0;
 	}
 	return ret;
@@ -143,14 +162,12 @@ static struct file *io_msg_grab_file(struct io_kiocb *req, unsigned int issue_fl
 	struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
 	struct io_ring_ctx *ctx = req->ctx;
 	struct file *file = NULL;
-	unsigned long file_ptr;
 	int idx = msg->src_fd;

 	io_ring_submit_lock(ctx, issue_flags);
 	if (likely(idx < ctx->nr_user_files)) {
 		idx = array_index_nospec(idx, ctx->nr_user_files);
-		file_ptr = io_fixed_file_slot(&ctx->file_table, idx)->file_ptr;
-		file = (struct file *) (file_ptr & FFS_MASK);
+		file = io_file_from_index(&ctx->file_table, idx);
 		if (file)
 			get_file(file);
 	}
@@ -243,7 +260,7 @@ int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	msg->src_fd = READ_ONCE(sqe->addr3);
 	msg->dst_fd = READ_ONCE(sqe->file_index);
 	msg->flags = READ_ONCE(sqe->msg_ring_flags);
-	if (msg->flags & ~IORING_MSG_RING_CQE_SKIP)
+	if (msg->flags & ~IORING_MSG_RING_MASK)
 		return -EINVAL;

 	return 0;
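
With IORING_MSG_RING_FLAGS_PASS, the sender can now choose the cqe->flags value posted into the target ring, carried in the SQE field that overlays dst_fd. A hedged sketch of the submission side follows, not part of this merge: io_uring_prep_msg_ring() is standard liburing, while writing sqe->file_index directly simply mirrors how io_msg_ring_prep() above reads cqe_flags (newer liburing versions may provide a dedicated helper for this).

/*
 * Hedged sketch: post a CQE into another ring and pass custom
 * cqe->flags along with it.  The helper name is illustrative.
 */
#include <liburing.h>

static int msg_ring_with_cqe_flags(struct io_uring *src, int target_ring_fd,
				   __u64 user_data, unsigned int res,
				   unsigned int cqe_flags)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(src);

	if (!sqe)
		return -EBUSY;

	/* len becomes cqe->res, data becomes cqe->user_data in the target ring */
	io_uring_prep_msg_ring(sqe, target_ring_fd, res, user_data,
			       IORING_MSG_RING_FLAGS_PASS);
	/* cqe_flags travels in sqe->file_index (unioned with dst_fd above) */
	sqe->file_index = cqe_flags;

	return io_uring_submit(src);
}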

View File

@ -92,6 +92,7 @@ int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return -EINVAL; return -EINVAL;
shutdown->how = READ_ONCE(sqe->len); shutdown->how = READ_ONCE(sqe->len);
req->flags |= REQ_F_FORCE_ASYNC;
return 0; return 0;
} }
@ -101,8 +102,7 @@ int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
struct socket *sock; struct socket *sock;
int ret; int ret;
if (issue_flags & IO_URING_F_NONBLOCK) WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
return -EAGAIN;
sock = sock_from_file(req->file); sock = sock_from_file(req->file);
if (unlikely(!sock)) if (unlikely(!sock))
@ -183,10 +183,14 @@ static int io_setup_async_msg(struct io_kiocb *req,
memcpy(async_msg, kmsg, sizeof(*kmsg)); memcpy(async_msg, kmsg, sizeof(*kmsg));
if (async_msg->msg.msg_name) if (async_msg->msg.msg_name)
async_msg->msg.msg_name = &async_msg->addr; async_msg->msg.msg_name = &async_msg->addr;
if ((req->flags & REQ_F_BUFFER_SELECT) && !async_msg->msg.msg_iter.nr_segs)
return -EAGAIN;
/* if were using fast_iov, set it to the new one */ /* if were using fast_iov, set it to the new one */
if (!kmsg->free_iov) { if (iter_is_iovec(&kmsg->msg.msg_iter) && !kmsg->free_iov) {
size_t fast_idx = kmsg->msg.msg_iter.iov - kmsg->fast_iov; size_t fast_idx = iter_iov(&kmsg->msg.msg_iter) - kmsg->fast_iov;
async_msg->msg.msg_iter.iov = &async_msg->fast_iov[fast_idx]; async_msg->msg.msg_iter.__iov = &async_msg->fast_iov[fast_idx];
} }
return -EAGAIN; return -EAGAIN;
@ -354,7 +358,6 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags)
struct sockaddr_storage __address; struct sockaddr_storage __address;
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
struct msghdr msg; struct msghdr msg;
struct iovec iov;
struct socket *sock; struct socket *sock;
unsigned flags; unsigned flags;
int min_ret = 0; int min_ret = 0;
@ -388,7 +391,7 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags)
if (unlikely(!sock)) if (unlikely(!sock))
return -ENOTSOCK; return -ENOTSOCK;
ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter); ret = import_ubuf(WRITE, sr->buf, sr->len, &msg.msg_iter);
if (unlikely(ret)) if (unlikely(ret))
return ret; return ret;
@ -398,6 +401,7 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags)
if (flags & MSG_WAITALL) if (flags & MSG_WAITALL)
min_ret = iov_iter_count(&msg.msg_iter); min_ret = iov_iter_count(&msg.msg_iter);
flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
msg.msg_flags = flags; msg.msg_flags = flags;
ret = sock_sendmsg(sock, &msg); ret = sock_sendmsg(sock, &msg);
if (ret < min_ret) { if (ret < min_ret) {
@ -542,6 +546,7 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req,
struct io_async_msghdr *iomsg) struct io_async_msghdr *iomsg)
{ {
iomsg->msg.msg_name = &iomsg->addr; iomsg->msg.msg_name = &iomsg->addr;
iomsg->msg.msg_iter.nr_segs = 0;
#ifdef CONFIG_COMPAT #ifdef CONFIG_COMPAT
if (req->ctx->compat) if (req->ctx->compat)
@ -625,9 +630,15 @@ static inline void io_recv_prep_retry(struct io_kiocb *req)
* again (for multishot). * again (for multishot).
*/ */
static inline bool io_recv_finish(struct io_kiocb *req, int *ret, static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
unsigned int cflags, bool mshot_finished, struct msghdr *msg, bool mshot_finished,
unsigned issue_flags) unsigned issue_flags)
{ {
unsigned int cflags;
cflags = io_put_kbuf(req, issue_flags);
if (msg->msg_inq && msg->msg_inq != -1)
cflags |= IORING_CQE_F_SOCK_NONEMPTY;
if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
io_req_set_res(req, *ret, cflags); io_req_set_res(req, *ret, cflags);
*ret = IOU_OK; *ret = IOU_OK;
@ -635,10 +646,18 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
} }
if (!mshot_finished) { if (!mshot_finished) {
if (io_aux_cqe(req->ctx, issue_flags & IO_URING_F_COMPLETE_DEFER, if (io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER,
req->cqe.user_data, *ret, cflags | IORING_CQE_F_MORE, true)) { *ret, cflags | IORING_CQE_F_MORE)) {
io_recv_prep_retry(req); io_recv_prep_retry(req);
return false; /* Known not-empty or unknown state, retry */
if (cflags & IORING_CQE_F_SOCK_NONEMPTY ||
msg->msg_inq == -1)
return false;
if (issue_flags & IO_URING_F_MULTISHOT)
*ret = IOU_ISSUE_SKIP_COMPLETE;
else
*ret = -EAGAIN;
return true;
} }
/* Otherwise stop multishot but use the current result. */ /* Otherwise stop multishot but use the current result. */
} }
@ -741,7 +760,6 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
struct io_async_msghdr iomsg, *kmsg; struct io_async_msghdr iomsg, *kmsg;
struct socket *sock; struct socket *sock;
unsigned int cflags;
unsigned flags; unsigned flags;
int ret, min_ret = 0; int ret, min_ret = 0;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
@ -784,25 +802,26 @@ retry_multishot:
} }
} }
kmsg->fast_iov[0].iov_base = buf; iov_iter_ubuf(&kmsg->msg.msg_iter, READ, buf, len);
kmsg->fast_iov[0].iov_len = len;
iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, 1,
len);
} }
flags = sr->msg_flags; flags = sr->msg_flags;
if (force_nonblock) if (force_nonblock)
flags |= MSG_DONTWAIT; flags |= MSG_DONTWAIT;
if (flags & MSG_WAITALL)
min_ret = iov_iter_count(&kmsg->msg.msg_iter);
kmsg->msg.msg_get_inq = 1; kmsg->msg.msg_get_inq = 1;
if (req->flags & REQ_F_APOLL_MULTISHOT) kmsg->msg.msg_inq = -1;
if (req->flags & REQ_F_APOLL_MULTISHOT) {
ret = io_recvmsg_multishot(sock, sr, kmsg, flags, ret = io_recvmsg_multishot(sock, sr, kmsg, flags,
&mshot_finished); &mshot_finished);
else } else {
/* disable partial retry for recvmsg with cmsg attached */
if (flags & MSG_WAITALL && !kmsg->msg.msg_controllen)
min_ret = iov_iter_count(&kmsg->msg.msg_iter);
ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg, ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg,
kmsg->uaddr, flags); kmsg->uaddr, flags);
}
if (ret < min_ret) { if (ret < min_ret) {
if (ret == -EAGAIN && force_nonblock) { if (ret == -EAGAIN && force_nonblock) {
@ -832,11 +851,7 @@ retry_multishot:
else else
io_kbuf_recycle(req, issue_flags); io_kbuf_recycle(req, issue_flags);
cflags = io_put_kbuf(req, issue_flags); if (!io_recv_finish(req, &ret, &kmsg->msg, mshot_finished, issue_flags))
if (kmsg->msg.msg_inq)
cflags |= IORING_CQE_F_SOCK_NONEMPTY;
if (!io_recv_finish(req, &ret, cflags, mshot_finished, issue_flags))
goto retry_multishot; goto retry_multishot;
if (mshot_finished) { if (mshot_finished) {
@ -855,8 +870,6 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags)
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
struct msghdr msg; struct msghdr msg;
struct socket *sock; struct socket *sock;
struct iovec iov;
unsigned int cflags;
unsigned flags; unsigned flags;
int ret, min_ret = 0; int ret, min_ret = 0;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
@ -873,6 +886,14 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags)
if (unlikely(!sock)) if (unlikely(!sock))
return -ENOTSOCK; return -ENOTSOCK;
msg.msg_name = NULL;
msg.msg_namelen = 0;
msg.msg_control = NULL;
msg.msg_get_inq = 1;
msg.msg_controllen = 0;
msg.msg_iocb = NULL;
msg.msg_ubuf = NULL;
retry_multishot: retry_multishot:
if (io_do_buffer_select(req)) { if (io_do_buffer_select(req)) {
void __user *buf; void __user *buf;
@ -883,18 +904,12 @@ retry_multishot:
sr->buf = buf; sr->buf = buf;
} }
ret = import_single_range(READ, sr->buf, len, &iov, &msg.msg_iter); ret = import_ubuf(READ, sr->buf, len, &msg.msg_iter);
if (unlikely(ret)) if (unlikely(ret))
goto out_free; goto out_free;
msg.msg_name = NULL; msg.msg_inq = -1;
msg.msg_namelen = 0;
msg.msg_control = NULL;
msg.msg_get_inq = 1;
msg.msg_flags = 0; msg.msg_flags = 0;
msg.msg_controllen = 0;
msg.msg_iocb = NULL;
msg.msg_ubuf = NULL;
flags = sr->msg_flags; flags = sr->msg_flags;
if (force_nonblock) if (force_nonblock)
@ -934,11 +949,7 @@ out_free:
else else
io_kbuf_recycle(req, issue_flags); io_kbuf_recycle(req, issue_flags);
cflags = io_put_kbuf(req, issue_flags); if (!io_recv_finish(req, &ret, &msg, ret <= 0, issue_flags))
if (msg.msg_inq)
cflags |= IORING_CQE_F_SOCK_NONEMPTY;
if (!io_recv_finish(req, &ret, cflags, ret <= 0, issue_flags))
goto retry_multishot; goto retry_multishot;
return ret; return ret;
@ -1094,7 +1105,6 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
struct sockaddr_storage __address; struct sockaddr_storage __address;
struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg); struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
struct msghdr msg; struct msghdr msg;
struct iovec iov;
struct socket *sock; struct socket *sock;
unsigned msg_flags; unsigned msg_flags;
int ret, min_ret = 0; int ret, min_ret = 0;
@ -1136,8 +1146,7 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
msg.sg_from_iter = io_sg_from_iter; msg.sg_from_iter = io_sg_from_iter;
} else { } else {
io_notif_set_extended(zc->notif); io_notif_set_extended(zc->notif);
ret = import_single_range(WRITE, zc->buf, zc->len, &iov, ret = import_ubuf(WRITE, zc->buf, zc->len, &msg.msg_iter);
&msg.msg_iter);
if (unlikely(ret)) if (unlikely(ret))
return ret; return ret;
ret = io_notif_account_mem(zc->notif, zc->len); ret = io_notif_account_mem(zc->notif, zc->len);
@ -1151,6 +1160,7 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
msg_flags |= MSG_DONTWAIT; msg_flags |= MSG_DONTWAIT;
if (msg_flags & MSG_WAITALL) if (msg_flags & MSG_WAITALL)
min_ret = iov_iter_count(&msg.msg_iter); min_ret = iov_iter_count(&msg.msg_iter);
msg_flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
msg.msg_flags = msg_flags; msg.msg_flags = msg_flags;
msg.msg_ubuf = &io_notif_to_data(zc->notif)->uarg; msg.msg_ubuf = &io_notif_to_data(zc->notif)->uarg;
@ -1312,7 +1322,6 @@ int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
int io_accept(struct io_kiocb *req, unsigned int issue_flags) int io_accept(struct io_kiocb *req, unsigned int issue_flags)
{ {
struct io_ring_ctx *ctx = req->ctx;
struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept); struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept);
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0; unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
@ -1362,8 +1371,8 @@ retry:
if (ret < 0) if (ret < 0)
return ret; return ret;
if (io_aux_cqe(ctx, issue_flags & IO_URING_F_COMPLETE_DEFER, if (io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER,
req->cqe.user_data, ret, IORING_CQE_F_MORE, true)) ret, IORING_CQE_F_MORE))
goto retry; goto retry;
return -ECANCELED; return -ECANCELED;
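
The recv changes above initialize msg_inq to -1 and only set IORING_CQE_F_SOCK_NONEMPTY when the socket is known to still have data queued, which also drives the multishot retry decision. A consumer can use the same bit to decide whether to issue another receive right away; the sketch below uses only standard liburing calls, and the function name and return convention are illustrative rather than anything defined by this merge.

/*
 * Hedged sketch: one-shot recv that reports whether the kernel said the
 * socket was non-empty when the CQE was posted.
 */
#include <liburing.h>
#include <stdbool.h>
#include <stddef.h>

static int recv_once(struct io_uring *ring, int sockfd, void *buf, size_t len,
		     bool *sock_nonempty)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int ret;

	if (!sqe)
		return -EBUSY;
	io_uring_prep_recv(sqe, sockfd, buf, len, 0);
	io_uring_submit(ring);

	ret = io_uring_wait_cqe(ring, &cqe);
	if (ret)
		return ret;

	/* set when msg_inq reported queued data at completion time */
	*sock_nonempty = cqe->flags & IORING_CQE_F_SOCK_NONEMPTY;
	ret = cqe->res;
	io_uring_cqe_seen(ring, cqe);
	return ret;	/* caller may loop while *sock_nonempty is true */
}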

View File

@ -5,8 +5,8 @@
#include "alloc_cache.h" #include "alloc_cache.h"
#if defined(CONFIG_NET)
struct io_async_msghdr { struct io_async_msghdr {
#if defined(CONFIG_NET)
union { union {
struct iovec fast_iov[UIO_FASTIOV]; struct iovec fast_iov[UIO_FASTIOV];
struct { struct {
@ -22,8 +22,11 @@ struct io_async_msghdr {
struct sockaddr __user *uaddr; struct sockaddr __user *uaddr;
struct msghdr msg; struct msghdr msg;
struct sockaddr_storage addr; struct sockaddr_storage addr;
#endif
}; };
#if defined(CONFIG_NET)
struct io_async_connect { struct io_async_connect {
struct sockaddr_storage address; struct sockaddr_storage address;
}; };

View File

@ -9,7 +9,7 @@
#include "notif.h" #include "notif.h"
#include "rsrc.h" #include "rsrc.h"
static void io_notif_complete_tw_ext(struct io_kiocb *notif, bool *locked) static void io_notif_complete_tw_ext(struct io_kiocb *notif, struct io_tw_state *ts)
{ {
struct io_notif_data *nd = io_notif_to_data(notif); struct io_notif_data *nd = io_notif_to_data(notif);
struct io_ring_ctx *ctx = notif->ctx; struct io_ring_ctx *ctx = notif->ctx;
@ -21,7 +21,7 @@ static void io_notif_complete_tw_ext(struct io_kiocb *notif, bool *locked)
__io_unaccount_mem(ctx->user, nd->account_pages); __io_unaccount_mem(ctx->user, nd->account_pages);
nd->account_pages = 0; nd->account_pages = 0;
} }
io_req_task_complete(notif, locked); io_req_task_complete(notif, ts);
} }
static void io_tx_ubuf_callback(struct sk_buff *skb, struct ubuf_info *uarg, static void io_tx_ubuf_callback(struct sk_buff *skb, struct ubuf_info *uarg,
@ -31,7 +31,7 @@ static void io_tx_ubuf_callback(struct sk_buff *skb, struct ubuf_info *uarg,
struct io_kiocb *notif = cmd_to_io_kiocb(nd); struct io_kiocb *notif = cmd_to_io_kiocb(nd);
if (refcount_dec_and_test(&uarg->refcnt)) if (refcount_dec_and_test(&uarg->refcnt))
io_req_task_work_add(notif); __io_req_task_work_add(notif, IOU_F_TWQ_LAZY_WAKE);
} }
static void io_tx_ubuf_callback_ext(struct sk_buff *skb, struct ubuf_info *uarg, static void io_tx_ubuf_callback_ext(struct sk_buff *skb, struct ubuf_info *uarg,
@ -68,9 +68,8 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx)
struct io_kiocb *notif; struct io_kiocb *notif;
struct io_notif_data *nd; struct io_notif_data *nd;
if (unlikely(!io_alloc_req_refill(ctx))) if (unlikely(!io_alloc_req(ctx, &notif)))
return NULL; return NULL;
notif = io_alloc_req(ctx);
notif->opcode = IORING_OP_NOP; notif->opcode = IORING_OP_NOP;
notif->flags = 0; notif->flags = 0;
notif->file = NULL; notif->file = NULL;
@ -80,7 +79,7 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx)
notif->io_task_work.func = io_req_task_complete; notif->io_task_work.func = io_req_task_complete;
nd = io_notif_to_data(notif); nd = io_notif_to_data(notif);
nd->uarg.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN; nd->uarg.flags = IO_NOTIF_UBUF_FLAGS;
nd->uarg.callback = io_tx_ubuf_callback; nd->uarg.callback = io_tx_ubuf_callback;
refcount_set(&nd->uarg.refcnt, 1); refcount_set(&nd->uarg.refcnt, 1);
return notif; return notif;

View File

@ -7,6 +7,7 @@
#include "rsrc.h" #include "rsrc.h"
#define IO_NOTIF_UBUF_FLAGS (SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN)
#define IO_NOTIF_SPLICE_BATCH 32 #define IO_NOTIF_SPLICE_BATCH 32
struct io_notif_data { struct io_notif_data {
@ -33,7 +34,7 @@ static inline void io_notif_flush(struct io_kiocb *notif)
/* drop slot's master ref */ /* drop slot's master ref */
if (refcount_dec_and_test(&nd->uarg.refcnt)) if (refcount_dec_and_test(&nd->uarg.refcnt))
io_req_task_work_add(notif); __io_req_task_work_add(notif, IOU_F_TWQ_LAZY_WAKE);
} }
static inline int io_notif_account_mem(struct io_kiocb *notif, unsigned len) static inline int io_notif_account_mem(struct io_kiocb *notif, unsigned len)

View File

@ -46,11 +46,10 @@ static __maybe_unused int io_eopnotsupp_prep(struct io_kiocb *kiocb,
return -EOPNOTSUPP; return -EOPNOTSUPP;
} }
const struct io_op_def io_op_defs[] = { const struct io_issue_def io_issue_defs[] = {
[IORING_OP_NOP] = { [IORING_OP_NOP] = {
.audit_skip = 1, .audit_skip = 1,
.iopoll = 1, .iopoll = 1,
.name = "NOP",
.prep = io_nop_prep, .prep = io_nop_prep,
.issue = io_nop, .issue = io_nop,
}, },
@ -64,13 +63,8 @@ const struct io_op_def io_op_defs[] = {
.ioprio = 1, .ioprio = 1,
.iopoll = 1, .iopoll = 1,
.iopoll_queue = 1, .iopoll_queue = 1,
.async_size = sizeof(struct io_async_rw),
.name = "READV",
.prep = io_prep_rw, .prep = io_prep_rw,
.issue = io_read, .issue = io_read,
.prep_async = io_readv_prep_async,
.cleanup = io_readv_writev_cleanup,
.fail = io_rw_fail,
}, },
[IORING_OP_WRITEV] = { [IORING_OP_WRITEV] = {
.needs_file = 1, .needs_file = 1,
@ -82,18 +76,12 @@ const struct io_op_def io_op_defs[] = {
.ioprio = 1, .ioprio = 1,
.iopoll = 1, .iopoll = 1,
.iopoll_queue = 1, .iopoll_queue = 1,
.async_size = sizeof(struct io_async_rw),
.name = "WRITEV",
.prep = io_prep_rw, .prep = io_prep_rw,
.issue = io_write, .issue = io_write,
.prep_async = io_writev_prep_async,
.cleanup = io_readv_writev_cleanup,
.fail = io_rw_fail,
}, },
[IORING_OP_FSYNC] = { [IORING_OP_FSYNC] = {
.needs_file = 1, .needs_file = 1,
.audit_skip = 1, .audit_skip = 1,
.name = "FSYNC",
.prep = io_fsync_prep, .prep = io_fsync_prep,
.issue = io_fsync, .issue = io_fsync,
}, },
@ -106,11 +94,8 @@ const struct io_op_def io_op_defs[] = {
.ioprio = 1, .ioprio = 1,
.iopoll = 1, .iopoll = 1,
.iopoll_queue = 1, .iopoll_queue = 1,
.async_size = sizeof(struct io_async_rw),
.name = "READ_FIXED",
.prep = io_prep_rw, .prep = io_prep_rw,
.issue = io_read, .issue = io_read,
.fail = io_rw_fail,
}, },
[IORING_OP_WRITE_FIXED] = { [IORING_OP_WRITE_FIXED] = {
.needs_file = 1, .needs_file = 1,
@ -122,30 +107,24 @@ const struct io_op_def io_op_defs[] = {
.ioprio = 1, .ioprio = 1,
.iopoll = 1, .iopoll = 1,
.iopoll_queue = 1, .iopoll_queue = 1,
.async_size = sizeof(struct io_async_rw),
.name = "WRITE_FIXED",
.prep = io_prep_rw, .prep = io_prep_rw,
.issue = io_write, .issue = io_write,
.fail = io_rw_fail,
}, },
[IORING_OP_POLL_ADD] = { [IORING_OP_POLL_ADD] = {
.needs_file = 1, .needs_file = 1,
.unbound_nonreg_file = 1, .unbound_nonreg_file = 1,
.audit_skip = 1, .audit_skip = 1,
.name = "POLL_ADD",
.prep = io_poll_add_prep, .prep = io_poll_add_prep,
.issue = io_poll_add, .issue = io_poll_add,
}, },
[IORING_OP_POLL_REMOVE] = { [IORING_OP_POLL_REMOVE] = {
.audit_skip = 1, .audit_skip = 1,
.name = "POLL_REMOVE",
.prep = io_poll_remove_prep, .prep = io_poll_remove_prep,
.issue = io_poll_remove, .issue = io_poll_remove,
}, },
[IORING_OP_SYNC_FILE_RANGE] = { [IORING_OP_SYNC_FILE_RANGE] = {
.needs_file = 1, .needs_file = 1,
.audit_skip = 1, .audit_skip = 1,
.name = "SYNC_FILE_RANGE",
.prep = io_sfr_prep, .prep = io_sfr_prep,
.issue = io_sync_file_range, .issue = io_sync_file_range,
}, },
@ -155,14 +134,9 @@ const struct io_op_def io_op_defs[] = {
.pollout = 1, .pollout = 1,
.ioprio = 1, .ioprio = 1,
.manual_alloc = 1, .manual_alloc = 1,
.name = "SENDMSG",
#if defined(CONFIG_NET) #if defined(CONFIG_NET)
.async_size = sizeof(struct io_async_msghdr),
.prep = io_sendmsg_prep, .prep = io_sendmsg_prep,
.issue = io_sendmsg, .issue = io_sendmsg,
.prep_async = io_sendmsg_prep_async,
.cleanup = io_sendmsg_recvmsg_cleanup,
.fail = io_sendrecv_fail,
#else #else
.prep = io_eopnotsupp_prep, .prep = io_eopnotsupp_prep,
#endif #endif
@ -174,29 +148,21 @@ const struct io_op_def io_op_defs[] = {
.buffer_select = 1, .buffer_select = 1,
.ioprio = 1, .ioprio = 1,
.manual_alloc = 1, .manual_alloc = 1,
.name = "RECVMSG",
#if defined(CONFIG_NET) #if defined(CONFIG_NET)
.async_size = sizeof(struct io_async_msghdr),
.prep = io_recvmsg_prep, .prep = io_recvmsg_prep,
.issue = io_recvmsg, .issue = io_recvmsg,
.prep_async = io_recvmsg_prep_async,
.cleanup = io_sendmsg_recvmsg_cleanup,
.fail = io_sendrecv_fail,
#else #else
.prep = io_eopnotsupp_prep, .prep = io_eopnotsupp_prep,
#endif #endif
}, },
[IORING_OP_TIMEOUT] = { [IORING_OP_TIMEOUT] = {
.audit_skip = 1, .audit_skip = 1,
.async_size = sizeof(struct io_timeout_data),
.name = "TIMEOUT",
.prep = io_timeout_prep, .prep = io_timeout_prep,
.issue = io_timeout, .issue = io_timeout,
}, },
[IORING_OP_TIMEOUT_REMOVE] = { [IORING_OP_TIMEOUT_REMOVE] = {
/* used by timeout updates' prep() */ /* used by timeout updates' prep() */
.audit_skip = 1, .audit_skip = 1,
.name = "TIMEOUT_REMOVE",
.prep = io_timeout_remove_prep, .prep = io_timeout_remove_prep,
.issue = io_timeout_remove, .issue = io_timeout_remove,
}, },
@ -206,7 +172,6 @@ const struct io_op_def io_op_defs[] = {
.pollin = 1, .pollin = 1,
.poll_exclusive = 1, .poll_exclusive = 1,
.ioprio = 1, /* used for flags */ .ioprio = 1, /* used for flags */
.name = "ACCEPT",
#if defined(CONFIG_NET) #if defined(CONFIG_NET)
.prep = io_accept_prep, .prep = io_accept_prep,
.issue = io_accept, .issue = io_accept,
@ -216,14 +181,11 @@ const struct io_op_def io_op_defs[] = {
}, },
[IORING_OP_ASYNC_CANCEL] = { [IORING_OP_ASYNC_CANCEL] = {
.audit_skip = 1, .audit_skip = 1,
.name = "ASYNC_CANCEL",
.prep = io_async_cancel_prep, .prep = io_async_cancel_prep,
.issue = io_async_cancel, .issue = io_async_cancel,
}, },
[IORING_OP_LINK_TIMEOUT] = { [IORING_OP_LINK_TIMEOUT] = {
.audit_skip = 1, .audit_skip = 1,
.async_size = sizeof(struct io_timeout_data),
.name = "LINK_TIMEOUT",
.prep = io_link_timeout_prep, .prep = io_link_timeout_prep,
.issue = io_no_issue, .issue = io_no_issue,
}, },
@ -231,46 +193,36 @@ const struct io_op_def io_op_defs[] = {
.needs_file = 1, .needs_file = 1,
.unbound_nonreg_file = 1, .unbound_nonreg_file = 1,
.pollout = 1, .pollout = 1,
.name = "CONNECT",
#if defined(CONFIG_NET) #if defined(CONFIG_NET)
.async_size = sizeof(struct io_async_connect),
.prep = io_connect_prep, .prep = io_connect_prep,
.issue = io_connect, .issue = io_connect,
.prep_async = io_connect_prep_async,
#else #else
.prep = io_eopnotsupp_prep, .prep = io_eopnotsupp_prep,
#endif #endif
}, },
[IORING_OP_FALLOCATE] = { [IORING_OP_FALLOCATE] = {
.needs_file = 1, .needs_file = 1,
.name = "FALLOCATE",
.prep = io_fallocate_prep, .prep = io_fallocate_prep,
.issue = io_fallocate, .issue = io_fallocate,
}, },
[IORING_OP_OPENAT] = { [IORING_OP_OPENAT] = {
.name = "OPENAT",
.prep = io_openat_prep, .prep = io_openat_prep,
.issue = io_openat, .issue = io_openat,
.cleanup = io_open_cleanup,
}, },
[IORING_OP_CLOSE] = { [IORING_OP_CLOSE] = {
.name = "CLOSE",
.prep = io_close_prep, .prep = io_close_prep,
.issue = io_close, .issue = io_close,
}, },
[IORING_OP_FILES_UPDATE] = { [IORING_OP_FILES_UPDATE] = {
.audit_skip = 1, .audit_skip = 1,
.iopoll = 1, .iopoll = 1,
.name = "FILES_UPDATE",
.prep = io_files_update_prep, .prep = io_files_update_prep,
.issue = io_files_update, .issue = io_files_update,
}, },
[IORING_OP_STATX] = { [IORING_OP_STATX] = {
.audit_skip = 1, .audit_skip = 1,
.name = "STATX",
.prep = io_statx_prep, .prep = io_statx_prep,
.issue = io_statx, .issue = io_statx,
.cleanup = io_statx_cleanup,
}, },
[IORING_OP_READ] = { [IORING_OP_READ] = {
.needs_file = 1, .needs_file = 1,
@ -282,11 +234,8 @@ const struct io_op_def io_op_defs[] = {
.ioprio = 1, .ioprio = 1,
.iopoll = 1, .iopoll = 1,
.iopoll_queue = 1, .iopoll_queue = 1,
.async_size = sizeof(struct io_async_rw),
.name = "READ",
.prep = io_prep_rw, .prep = io_prep_rw,
.issue = io_read, .issue = io_read,
.fail = io_rw_fail,
}, },
[IORING_OP_WRITE] = { [IORING_OP_WRITE] = {
.needs_file = 1, .needs_file = 1,
@ -298,22 +247,17 @@ const struct io_op_def io_op_defs[] = {
.ioprio = 1, .ioprio = 1,
.iopoll = 1, .iopoll = 1,
.iopoll_queue = 1, .iopoll_queue = 1,
.async_size = sizeof(struct io_async_rw),
.name = "WRITE",
.prep = io_prep_rw, .prep = io_prep_rw,
.issue = io_write, .issue = io_write,
.fail = io_rw_fail,
}, },
[IORING_OP_FADVISE] = { [IORING_OP_FADVISE] = {
.needs_file = 1, .needs_file = 1,
.audit_skip = 1, .audit_skip = 1,
.name = "FADVISE",
.prep = io_fadvise_prep, .prep = io_fadvise_prep,
.issue = io_fadvise, .issue = io_fadvise,
}, },
[IORING_OP_MADVISE] = { [IORING_OP_MADVISE] = {
.audit_skip = 1, .audit_skip = 1,
.name = "MADVISE",
.prep = io_madvise_prep, .prep = io_madvise_prep,
.issue = io_madvise, .issue = io_madvise,
}, },
@ -324,13 +268,9 @@ const struct io_op_def io_op_defs[] = {
.audit_skip = 1, .audit_skip = 1,
.ioprio = 1, .ioprio = 1,
.manual_alloc = 1, .manual_alloc = 1,
.name = "SEND",
#if defined(CONFIG_NET) #if defined(CONFIG_NET)
.async_size = sizeof(struct io_async_msghdr),
.prep = io_sendmsg_prep, .prep = io_sendmsg_prep,
.issue = io_send, .issue = io_send,
.fail = io_sendrecv_fail,
.prep_async = io_send_prep_async,
#else #else
.prep = io_eopnotsupp_prep, .prep = io_eopnotsupp_prep,
#endif #endif
@ -342,25 +282,20 @@ const struct io_op_def io_op_defs[] = {
.buffer_select = 1, .buffer_select = 1,
.audit_skip = 1, .audit_skip = 1,
.ioprio = 1, .ioprio = 1,
.name = "RECV",
#if defined(CONFIG_NET) #if defined(CONFIG_NET)
.prep = io_recvmsg_prep, .prep = io_recvmsg_prep,
.issue = io_recv, .issue = io_recv,
.fail = io_sendrecv_fail,
#else #else
.prep = io_eopnotsupp_prep, .prep = io_eopnotsupp_prep,
#endif #endif
}, },
[IORING_OP_OPENAT2] = { [IORING_OP_OPENAT2] = {
.name = "OPENAT2",
.prep = io_openat2_prep, .prep = io_openat2_prep,
.issue = io_openat2, .issue = io_openat2,
.cleanup = io_open_cleanup,
}, },
[IORING_OP_EPOLL_CTL] = { [IORING_OP_EPOLL_CTL] = {
.unbound_nonreg_file = 1, .unbound_nonreg_file = 1,
.audit_skip = 1, .audit_skip = 1,
.name = "EPOLL",
#if defined(CONFIG_EPOLL) #if defined(CONFIG_EPOLL)
.prep = io_epoll_ctl_prep, .prep = io_epoll_ctl_prep,
.issue = io_epoll_ctl, .issue = io_epoll_ctl,
@ -373,21 +308,18 @@ const struct io_op_def io_op_defs[] = {
.hash_reg_file = 1, .hash_reg_file = 1,
.unbound_nonreg_file = 1, .unbound_nonreg_file = 1,
.audit_skip = 1, .audit_skip = 1,
.name = "SPLICE",
.prep = io_splice_prep, .prep = io_splice_prep,
.issue = io_splice, .issue = io_splice,
}, },
[IORING_OP_PROVIDE_BUFFERS] = { [IORING_OP_PROVIDE_BUFFERS] = {
.audit_skip = 1, .audit_skip = 1,
.iopoll = 1, .iopoll = 1,
.name = "PROVIDE_BUFFERS",
.prep = io_provide_buffers_prep, .prep = io_provide_buffers_prep,
.issue = io_provide_buffers, .issue = io_provide_buffers,
}, },
[IORING_OP_REMOVE_BUFFERS] = { [IORING_OP_REMOVE_BUFFERS] = {
.audit_skip = 1, .audit_skip = 1,
.iopoll = 1, .iopoll = 1,
.name = "REMOVE_BUFFERS",
.prep = io_remove_buffers_prep, .prep = io_remove_buffers_prep,
.issue = io_remove_buffers, .issue = io_remove_buffers,
}, },
@ -396,13 +328,11 @@ const struct io_op_def io_op_defs[] = {
.hash_reg_file = 1, .hash_reg_file = 1,
.unbound_nonreg_file = 1, .unbound_nonreg_file = 1,
.audit_skip = 1, .audit_skip = 1,
.name = "TEE",
.prep = io_tee_prep, .prep = io_tee_prep,
.issue = io_tee, .issue = io_tee,
}, },
[IORING_OP_SHUTDOWN] = { [IORING_OP_SHUTDOWN] = {
.needs_file = 1, .needs_file = 1,
.name = "SHUTDOWN",
#if defined(CONFIG_NET) #if defined(CONFIG_NET)
.prep = io_shutdown_prep, .prep = io_shutdown_prep,
.issue = io_shutdown, .issue = io_shutdown,
@ -411,72 +341,51 @@ const struct io_op_def io_op_defs[] = {
#endif #endif
}, },
[IORING_OP_RENAMEAT] = { [IORING_OP_RENAMEAT] = {
.name = "RENAMEAT",
.prep = io_renameat_prep, .prep = io_renameat_prep,
.issue = io_renameat, .issue = io_renameat,
.cleanup = io_renameat_cleanup,
}, },
[IORING_OP_UNLINKAT] = { [IORING_OP_UNLINKAT] = {
.name = "UNLINKAT",
.prep = io_unlinkat_prep, .prep = io_unlinkat_prep,
.issue = io_unlinkat, .issue = io_unlinkat,
.cleanup = io_unlinkat_cleanup,
}, },
[IORING_OP_MKDIRAT] = { [IORING_OP_MKDIRAT] = {
.name = "MKDIRAT",
.prep = io_mkdirat_prep, .prep = io_mkdirat_prep,
.issue = io_mkdirat, .issue = io_mkdirat,
.cleanup = io_mkdirat_cleanup,
}, },
[IORING_OP_SYMLINKAT] = { [IORING_OP_SYMLINKAT] = {
.name = "SYMLINKAT",
.prep = io_symlinkat_prep, .prep = io_symlinkat_prep,
.issue = io_symlinkat, .issue = io_symlinkat,
.cleanup = io_link_cleanup,
}, },
[IORING_OP_LINKAT] = { [IORING_OP_LINKAT] = {
.name = "LINKAT",
.prep = io_linkat_prep, .prep = io_linkat_prep,
.issue = io_linkat, .issue = io_linkat,
.cleanup = io_link_cleanup,
}, },
[IORING_OP_MSG_RING] = { [IORING_OP_MSG_RING] = {
.needs_file = 1, .needs_file = 1,
.iopoll = 1, .iopoll = 1,
.name = "MSG_RING",
.prep = io_msg_ring_prep, .prep = io_msg_ring_prep,
.issue = io_msg_ring, .issue = io_msg_ring,
.cleanup = io_msg_ring_cleanup,
}, },
[IORING_OP_FSETXATTR] = { [IORING_OP_FSETXATTR] = {
.needs_file = 1, .needs_file = 1,
.name = "FSETXATTR",
.prep = io_fsetxattr_prep, .prep = io_fsetxattr_prep,
.issue = io_fsetxattr, .issue = io_fsetxattr,
.cleanup = io_xattr_cleanup,
}, },
[IORING_OP_SETXATTR] = { [IORING_OP_SETXATTR] = {
.name = "SETXATTR",
.prep = io_setxattr_prep, .prep = io_setxattr_prep,
.issue = io_setxattr, .issue = io_setxattr,
.cleanup = io_xattr_cleanup,
}, },
[IORING_OP_FGETXATTR] = { [IORING_OP_FGETXATTR] = {
.needs_file = 1, .needs_file = 1,
.name = "FGETXATTR",
.prep = io_fgetxattr_prep, .prep = io_fgetxattr_prep,
.issue = io_fgetxattr, .issue = io_fgetxattr,
.cleanup = io_xattr_cleanup,
}, },
[IORING_OP_GETXATTR] = { [IORING_OP_GETXATTR] = {
.name = "GETXATTR",
.prep = io_getxattr_prep, .prep = io_getxattr_prep,
.issue = io_getxattr, .issue = io_getxattr,
.cleanup = io_xattr_cleanup,
}, },
[IORING_OP_SOCKET] = { [IORING_OP_SOCKET] = {
.audit_skip = 1, .audit_skip = 1,
.name = "SOCKET",
#if defined(CONFIG_NET) #if defined(CONFIG_NET)
.prep = io_socket_prep, .prep = io_socket_prep,
.issue = io_socket, .issue = io_socket,
@ -487,16 +396,12 @@ const struct io_op_def io_op_defs[] = {
[IORING_OP_URING_CMD] = { [IORING_OP_URING_CMD] = {
.needs_file = 1, .needs_file = 1,
.plug = 1, .plug = 1,
.name = "URING_CMD",
.iopoll = 1, .iopoll = 1,
.iopoll_queue = 1, .iopoll_queue = 1,
.async_size = uring_cmd_pdu_size(1),
.prep = io_uring_cmd_prep, .prep = io_uring_cmd_prep,
.issue = io_uring_cmd, .issue = io_uring_cmd,
.prep_async = io_uring_cmd_prep_async,
}, },
[IORING_OP_SEND_ZC] = { [IORING_OP_SEND_ZC] = {
.name = "SEND_ZC",
.needs_file = 1, .needs_file = 1,
.unbound_nonreg_file = 1, .unbound_nonreg_file = 1,
.pollout = 1, .pollout = 1,
@ -504,32 +409,243 @@ const struct io_op_def io_op_defs[] = {
.ioprio = 1, .ioprio = 1,
.manual_alloc = 1, .manual_alloc = 1,
#if defined(CONFIG_NET) #if defined(CONFIG_NET)
.async_size = sizeof(struct io_async_msghdr),
.prep = io_send_zc_prep, .prep = io_send_zc_prep,
.issue = io_send_zc, .issue = io_send_zc,
.prep_async = io_send_prep_async,
.cleanup = io_send_zc_cleanup,
.fail = io_sendrecv_fail,
#else #else
.prep = io_eopnotsupp_prep, .prep = io_eopnotsupp_prep,
#endif #endif
}, },
[IORING_OP_SENDMSG_ZC] = { [IORING_OP_SENDMSG_ZC] = {
.name = "SENDMSG_ZC",
.needs_file = 1, .needs_file = 1,
.unbound_nonreg_file = 1, .unbound_nonreg_file = 1,
.pollout = 1, .pollout = 1,
.ioprio = 1, .ioprio = 1,
.manual_alloc = 1, .manual_alloc = 1,
#if defined(CONFIG_NET) #if defined(CONFIG_NET)
.async_size = sizeof(struct io_async_msghdr),
.prep = io_send_zc_prep, .prep = io_send_zc_prep,
.issue = io_sendmsg_zc, .issue = io_sendmsg_zc,
#else
.prep = io_eopnotsupp_prep,
#endif
},
};
const struct io_cold_def io_cold_defs[] = {
[IORING_OP_NOP] = {
.name = "NOP",
},
[IORING_OP_READV] = {
.async_size = sizeof(struct io_async_rw),
.name = "READV",
.prep_async = io_readv_prep_async,
.cleanup = io_readv_writev_cleanup,
.fail = io_rw_fail,
},
[IORING_OP_WRITEV] = {
.async_size = sizeof(struct io_async_rw),
.name = "WRITEV",
.prep_async = io_writev_prep_async,
.cleanup = io_readv_writev_cleanup,
.fail = io_rw_fail,
},
[IORING_OP_FSYNC] = {
.name = "FSYNC",
},
[IORING_OP_READ_FIXED] = {
.async_size = sizeof(struct io_async_rw),
.name = "READ_FIXED",
.fail = io_rw_fail,
},
[IORING_OP_WRITE_FIXED] = {
.async_size = sizeof(struct io_async_rw),
.name = "WRITE_FIXED",
.fail = io_rw_fail,
},
[IORING_OP_POLL_ADD] = {
.name = "POLL_ADD",
},
[IORING_OP_POLL_REMOVE] = {
.name = "POLL_REMOVE",
},
[IORING_OP_SYNC_FILE_RANGE] = {
.name = "SYNC_FILE_RANGE",
},
[IORING_OP_SENDMSG] = {
.name = "SENDMSG",
#if defined(CONFIG_NET)
.async_size = sizeof(struct io_async_msghdr),
.prep_async = io_sendmsg_prep_async,
.cleanup = io_sendmsg_recvmsg_cleanup,
.fail = io_sendrecv_fail,
#endif
},
[IORING_OP_RECVMSG] = {
.name = "RECVMSG",
#if defined(CONFIG_NET)
.async_size = sizeof(struct io_async_msghdr),
.prep_async = io_recvmsg_prep_async,
.cleanup = io_sendmsg_recvmsg_cleanup,
.fail = io_sendrecv_fail,
#endif
},
[IORING_OP_TIMEOUT] = {
.async_size = sizeof(struct io_timeout_data),
.name = "TIMEOUT",
},
[IORING_OP_TIMEOUT_REMOVE] = {
.name = "TIMEOUT_REMOVE",
},
[IORING_OP_ACCEPT] = {
.name = "ACCEPT",
},
[IORING_OP_ASYNC_CANCEL] = {
.name = "ASYNC_CANCEL",
},
[IORING_OP_LINK_TIMEOUT] = {
.async_size = sizeof(struct io_timeout_data),
.name = "LINK_TIMEOUT",
},
[IORING_OP_CONNECT] = {
.name = "CONNECT",
#if defined(CONFIG_NET)
.async_size = sizeof(struct io_async_connect),
.prep_async = io_connect_prep_async,
#endif
},
[IORING_OP_FALLOCATE] = {
.name = "FALLOCATE",
},
[IORING_OP_OPENAT] = {
.name = "OPENAT",
.cleanup = io_open_cleanup,
},
[IORING_OP_CLOSE] = {
.name = "CLOSE",
},
[IORING_OP_FILES_UPDATE] = {
.name = "FILES_UPDATE",
},
[IORING_OP_STATX] = {
.name = "STATX",
.cleanup = io_statx_cleanup,
},
[IORING_OP_READ] = {
.async_size = sizeof(struct io_async_rw),
.name = "READ",
.fail = io_rw_fail,
},
[IORING_OP_WRITE] = {
.async_size = sizeof(struct io_async_rw),
.name = "WRITE",
.fail = io_rw_fail,
},
[IORING_OP_FADVISE] = {
.name = "FADVISE",
},
[IORING_OP_MADVISE] = {
.name = "MADVISE",
},
[IORING_OP_SEND] = {
.name = "SEND",
#if defined(CONFIG_NET)
.async_size = sizeof(struct io_async_msghdr),
.fail = io_sendrecv_fail,
.prep_async = io_send_prep_async,
#endif
},
[IORING_OP_RECV] = {
.name = "RECV",
#if defined(CONFIG_NET)
.fail = io_sendrecv_fail,
#endif
},
[IORING_OP_OPENAT2] = {
.name = "OPENAT2",
.cleanup = io_open_cleanup,
},
[IORING_OP_EPOLL_CTL] = {
.name = "EPOLL",
},
[IORING_OP_SPLICE] = {
.name = "SPLICE",
},
[IORING_OP_PROVIDE_BUFFERS] = {
.name = "PROVIDE_BUFFERS",
},
[IORING_OP_REMOVE_BUFFERS] = {
.name = "REMOVE_BUFFERS",
},
[IORING_OP_TEE] = {
.name = "TEE",
},
[IORING_OP_SHUTDOWN] = {
.name = "SHUTDOWN",
},
[IORING_OP_RENAMEAT] = {
.name = "RENAMEAT",
.cleanup = io_renameat_cleanup,
},
[IORING_OP_UNLINKAT] = {
.name = "UNLINKAT",
.cleanup = io_unlinkat_cleanup,
},
[IORING_OP_MKDIRAT] = {
.name = "MKDIRAT",
.cleanup = io_mkdirat_cleanup,
},
[IORING_OP_SYMLINKAT] = {
.name = "SYMLINKAT",
.cleanup = io_link_cleanup,
},
[IORING_OP_LINKAT] = {
.name = "LINKAT",
.cleanup = io_link_cleanup,
},
[IORING_OP_MSG_RING] = {
.name = "MSG_RING",
.cleanup = io_msg_ring_cleanup,
},
[IORING_OP_FSETXATTR] = {
.name = "FSETXATTR",
.cleanup = io_xattr_cleanup,
},
[IORING_OP_SETXATTR] = {
.name = "SETXATTR",
.cleanup = io_xattr_cleanup,
},
[IORING_OP_FGETXATTR] = {
.name = "FGETXATTR",
.cleanup = io_xattr_cleanup,
},
[IORING_OP_GETXATTR] = {
.name = "GETXATTR",
.cleanup = io_xattr_cleanup,
},
[IORING_OP_SOCKET] = {
.name = "SOCKET",
},
[IORING_OP_URING_CMD] = {
.name = "URING_CMD",
.async_size = 2 * sizeof(struct io_uring_sqe),
.prep_async = io_uring_cmd_prep_async,
},
[IORING_OP_SEND_ZC] = {
.name = "SEND_ZC",
#if defined(CONFIG_NET)
.async_size = sizeof(struct io_async_msghdr),
.prep_async = io_send_prep_async,
.cleanup = io_send_zc_cleanup,
.fail = io_sendrecv_fail,
#endif
},
[IORING_OP_SENDMSG_ZC] = {
.name = "SENDMSG_ZC",
#if defined(CONFIG_NET)
.async_size = sizeof(struct io_async_msghdr),
.prep_async = io_sendmsg_prep_async, .prep_async = io_sendmsg_prep_async,
.cleanup = io_send_zc_cleanup, .cleanup = io_send_zc_cleanup,
.fail = io_sendrecv_fail, .fail = io_sendrecv_fail,
#else
.prep = io_eopnotsupp_prep,
#endif #endif
}, },
}; };
@ -537,7 +653,7 @@ const struct io_op_def io_op_defs[] = {
const char *io_uring_get_opcode(u8 opcode) const char *io_uring_get_opcode(u8 opcode)
{ {
if (opcode < IORING_OP_LAST) if (opcode < IORING_OP_LAST)
return io_op_defs[opcode].name; return io_cold_defs[opcode].name;
return "INVALID"; return "INVALID";
} }
@ -545,12 +661,13 @@ void __init io_uring_optable_init(void)
{ {
int i; int i;
BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST); BUILD_BUG_ON(ARRAY_SIZE(io_cold_defs) != IORING_OP_LAST);
BUILD_BUG_ON(ARRAY_SIZE(io_issue_defs) != IORING_OP_LAST);
for (i = 0; i < ARRAY_SIZE(io_op_defs); i++) { for (i = 0; i < ARRAY_SIZE(io_issue_defs); i++) {
BUG_ON(!io_op_defs[i].prep); BUG_ON(!io_issue_defs[i].prep);
if (io_op_defs[i].prep != io_eopnotsupp_prep) if (io_issue_defs[i].prep != io_eopnotsupp_prep)
BUG_ON(!io_op_defs[i].issue); BUG_ON(!io_issue_defs[i].issue);
WARN_ON_ONCE(!io_op_defs[i].name); WARN_ON_ONCE(!io_cold_defs[i].name);
} }
} }

View File

@ -2,7 +2,7 @@
#ifndef IOU_OP_DEF_H #ifndef IOU_OP_DEF_H
#define IOU_OP_DEF_H #define IOU_OP_DEF_H
struct io_op_def { struct io_issue_def {
/* needs req->file assigned */ /* needs req->file assigned */
unsigned needs_file : 1; unsigned needs_file : 1;
/* should block plug */ /* should block plug */
@ -29,19 +29,24 @@ struct io_op_def {
unsigned iopoll_queue : 1; unsigned iopoll_queue : 1;
/* opcode specific path will handle ->async_data allocation if needed */ /* opcode specific path will handle ->async_data allocation if needed */
unsigned manual_alloc : 1; unsigned manual_alloc : 1;
int (*issue)(struct io_kiocb *, unsigned int);
int (*prep)(struct io_kiocb *, const struct io_uring_sqe *);
};
struct io_cold_def {
/* size of async data needed, if any */ /* size of async data needed, if any */
unsigned short async_size; unsigned short async_size;
const char *name; const char *name;
int (*prep)(struct io_kiocb *, const struct io_uring_sqe *);
int (*issue)(struct io_kiocb *, unsigned int);
int (*prep_async)(struct io_kiocb *); int (*prep_async)(struct io_kiocb *);
void (*cleanup)(struct io_kiocb *); void (*cleanup)(struct io_kiocb *);
void (*fail)(struct io_kiocb *); void (*fail)(struct io_kiocb *);
}; };
extern const struct io_op_def io_op_defs[]; extern const struct io_issue_def io_issue_defs[];
extern const struct io_cold_def io_cold_defs[];
void io_uring_optable_init(void); void io_uring_optable_init(void);
#endif #endif

View File

@ -31,6 +31,17 @@ struct io_close {
u32 file_slot; u32 file_slot;
}; };
static bool io_openat_force_async(struct io_open *open)
{
/*
* Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open,
* it'll always -EAGAIN. Note that we test for __O_TMPFILE because
* O_TMPFILE includes O_DIRECTORY, which isn't a flag we need to force
* async for.
*/
return open->how.flags & (O_TRUNC | O_CREAT | __O_TMPFILE);
}
static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{ {
struct io_open *open = io_kiocb_to_cmd(req, struct io_open); struct io_open *open = io_kiocb_to_cmd(req, struct io_open);
@ -61,6 +72,8 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
open->nofile = rlimit(RLIMIT_NOFILE); open->nofile = rlimit(RLIMIT_NOFILE);
req->flags |= REQ_F_NEED_CLEANUP; req->flags |= REQ_F_NEED_CLEANUP;
if (io_openat_force_async(open))
req->flags |= REQ_F_FORCE_ASYNC;
return 0; return 0;
} }
@ -108,12 +121,7 @@ int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
nonblock_set = op.open_flag & O_NONBLOCK; nonblock_set = op.open_flag & O_NONBLOCK;
resolve_nonblock = open->how.resolve & RESOLVE_CACHED; resolve_nonblock = open->how.resolve & RESOLVE_CACHED;
if (issue_flags & IO_URING_F_NONBLOCK) { if (issue_flags & IO_URING_F_NONBLOCK) {
/* WARN_ON_ONCE(io_openat_force_async(open));
* Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open,
* it'll always -EAGAIN
*/
if (open->how.flags & (O_TRUNC | O_CREAT | O_TMPFILE))
return -EAGAIN;
op.lookup_flags |= LOOKUP_CACHED; op.lookup_flags |= LOOKUP_CACHED;
op.open_flag |= O_NONBLOCK; op.open_flag |= O_NONBLOCK;
} }
@ -144,7 +152,6 @@ int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set) if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set)
file->f_flags &= ~O_NONBLOCK; file->f_flags &= ~O_NONBLOCK;
fsnotify_open(file);
if (!fixed) if (!fixed)
fd_install(ret, file); fd_install(ret, file);

View File

@ -51,6 +51,9 @@ struct io_poll_table {
#define IO_WQE_F_DOUBLE 1 #define IO_WQE_F_DOUBLE 1
static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
void *key);
static inline struct io_kiocb *wqe_to_req(struct wait_queue_entry *wqe) static inline struct io_kiocb *wqe_to_req(struct wait_queue_entry *wqe)
{ {
unsigned long priv = (unsigned long)wqe->private; unsigned long priv = (unsigned long)wqe->private;
@ -145,7 +148,7 @@ static void io_poll_req_insert_locked(struct io_kiocb *req)
hlist_add_head(&req->hash_node, &table->hbs[index].list); hlist_add_head(&req->hash_node, &table->hbs[index].list);
} }
static void io_poll_tw_hash_eject(struct io_kiocb *req, bool *locked) static void io_poll_tw_hash_eject(struct io_kiocb *req, struct io_tw_state *ts)
{ {
struct io_ring_ctx *ctx = req->ctx; struct io_ring_ctx *ctx = req->ctx;
@ -156,7 +159,7 @@ static void io_poll_tw_hash_eject(struct io_kiocb *req, bool *locked)
* already grabbed the mutex for us, but there is a chance it * already grabbed the mutex for us, but there is a chance it
* failed. * failed.
*/ */
io_tw_lock(ctx, locked); io_tw_lock(ctx, ts);
hash_del(&req->hash_node); hash_del(&req->hash_node);
req->flags &= ~REQ_F_HASH_LOCKED; req->flags &= ~REQ_F_HASH_LOCKED;
} else { } else {
@ -164,15 +167,14 @@ static void io_poll_tw_hash_eject(struct io_kiocb *req, bool *locked)
} }
} }
static void io_init_poll_iocb(struct io_poll *poll, __poll_t events, static void io_init_poll_iocb(struct io_poll *poll, __poll_t events)
wait_queue_func_t wake_func)
{ {
poll->head = NULL; poll->head = NULL;
#define IO_POLL_UNMASK (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP) #define IO_POLL_UNMASK (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP)
/* mask in events that we always want/need */ /* mask in events that we always want/need */
poll->events = events | IO_POLL_UNMASK; poll->events = events | IO_POLL_UNMASK;
INIT_LIST_HEAD(&poll->wait.entry); INIT_LIST_HEAD(&poll->wait.entry);
init_waitqueue_func_entry(&poll->wait, wake_func); init_waitqueue_func_entry(&poll->wait, io_poll_wake);
} }
static inline void io_poll_remove_entry(struct io_poll *poll) static inline void io_poll_remove_entry(struct io_poll *poll)
@ -236,7 +238,7 @@ enum {
* req->cqe.res. IOU_POLL_REMOVE_POLL_USE_RES indicates to remove multishot * req->cqe.res. IOU_POLL_REMOVE_POLL_USE_RES indicates to remove multishot
* poll and that the result is stored in req->cqe. * poll and that the result is stored in req->cqe.
*/ */
static int io_poll_check_events(struct io_kiocb *req, bool *locked) static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts)
{ {
int v; int v;
@ -298,13 +300,13 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked)
__poll_t mask = mangle_poll(req->cqe.res & __poll_t mask = mangle_poll(req->cqe.res &
req->apoll_events); req->apoll_events);
if (!io_aux_cqe(req->ctx, *locked, req->cqe.user_data, if (!io_fill_cqe_req_aux(req, ts->locked, mask,
mask, IORING_CQE_F_MORE, false)) { IORING_CQE_F_MORE)) {
io_req_set_res(req, mask, 0); io_req_set_res(req, mask, 0);
return IOU_POLL_REMOVE_POLL_USE_RES; return IOU_POLL_REMOVE_POLL_USE_RES;
} }
} else { } else {
int ret = io_poll_issue(req, locked); int ret = io_poll_issue(req, ts);
if (ret == IOU_STOP_MULTISHOT) if (ret == IOU_STOP_MULTISHOT)
return IOU_POLL_REMOVE_POLL_USE_RES; return IOU_POLL_REMOVE_POLL_USE_RES;
if (ret < 0) if (ret < 0)
@ -324,15 +326,15 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked)
return IOU_POLL_NO_ACTION; return IOU_POLL_NO_ACTION;
} }
static void io_poll_task_func(struct io_kiocb *req, bool *locked) void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts)
{ {
int ret; int ret;
ret = io_poll_check_events(req, locked); ret = io_poll_check_events(req, ts);
if (ret == IOU_POLL_NO_ACTION) if (ret == IOU_POLL_NO_ACTION)
return; return;
io_poll_remove_entries(req); io_poll_remove_entries(req);
io_poll_tw_hash_eject(req, locked); io_poll_tw_hash_eject(req, ts);
if (req->opcode == IORING_OP_POLL_ADD) { if (req->opcode == IORING_OP_POLL_ADD) {
if (ret == IOU_POLL_DONE) { if (ret == IOU_POLL_DONE) {
@ -341,7 +343,7 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked)
poll = io_kiocb_to_cmd(req, struct io_poll); poll = io_kiocb_to_cmd(req, struct io_poll);
req->cqe.res = mangle_poll(req->cqe.res & poll->events); req->cqe.res = mangle_poll(req->cqe.res & poll->events);
} else if (ret == IOU_POLL_REISSUE) { } else if (ret == IOU_POLL_REISSUE) {
io_req_task_submit(req, locked); io_req_task_submit(req, ts);
return; return;
} else if (ret != IOU_POLL_REMOVE_POLL_USE_RES) { } else if (ret != IOU_POLL_REMOVE_POLL_USE_RES) {
req->cqe.res = ret; req->cqe.res = ret;
@ -349,14 +351,14 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked)
} }
io_req_set_res(req, req->cqe.res, 0); io_req_set_res(req, req->cqe.res, 0);
io_req_task_complete(req, locked); io_req_task_complete(req, ts);
} else { } else {
io_tw_lock(req->ctx, locked); io_tw_lock(req->ctx, ts);
if (ret == IOU_POLL_REMOVE_POLL_USE_RES) if (ret == IOU_POLL_REMOVE_POLL_USE_RES)
io_req_task_complete(req, locked); io_req_task_complete(req, ts);
else if (ret == IOU_POLL_DONE || ret == IOU_POLL_REISSUE) else if (ret == IOU_POLL_DONE || ret == IOU_POLL_REISSUE)
io_req_task_submit(req, locked); io_req_task_submit(req, ts);
else else
io_req_defer_failed(req, ret); io_req_defer_failed(req, ret);
} }
@ -508,7 +510,7 @@ static void __io_queue_proc(struct io_poll *poll, struct io_poll_table *pt,
/* mark as double wq entry */ /* mark as double wq entry */
wqe_private |= IO_WQE_F_DOUBLE; wqe_private |= IO_WQE_F_DOUBLE;
io_init_poll_iocb(poll, first->events, first->wait.func); io_init_poll_iocb(poll, first->events);
if (!io_poll_double_prepare(req)) { if (!io_poll_double_prepare(req)) {
/* the request is completing, just back off */ /* the request is completing, just back off */
kfree(poll); kfree(poll);
@ -569,7 +571,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
INIT_HLIST_NODE(&req->hash_node); INIT_HLIST_NODE(&req->hash_node);
req->work.cancel_seq = atomic_read(&ctx->cancel_seq); req->work.cancel_seq = atomic_read(&ctx->cancel_seq);
io_init_poll_iocb(poll, mask, io_poll_wake); io_init_poll_iocb(poll, mask);
poll->file = req->file; poll->file = req->file;
req->apoll_events = poll->events; req->apoll_events = poll->events;
@ -690,7 +692,7 @@ alloc_apoll:
int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
{ {
const struct io_op_def *def = &io_op_defs[req->opcode]; const struct io_issue_def *def = &io_issue_defs[req->opcode];
struct async_poll *apoll; struct async_poll *apoll;
struct io_poll_table ipt; struct io_poll_table ipt;
__poll_t mask = POLLPRI | POLLERR | EPOLLET; __poll_t mask = POLLPRI | POLLERR | EPOLLET;
@ -822,14 +824,10 @@ static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx,
spin_lock(&hb->lock); spin_lock(&hb->lock);
hlist_for_each_entry(req, &hb->list, hash_node) { hlist_for_each_entry(req, &hb->list, hash_node) {
if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) && if (io_cancel_req_match(req, cd)) {
req->file != cd->file) *out_bucket = hb;
continue; return req;
if (cd->seq == req->work.cancel_seq) }
continue;
req->work.cancel_seq = cd->seq;
*out_bucket = hb;
return req;
} }
spin_unlock(&hb->lock); spin_unlock(&hb->lock);
} }
@ -853,7 +851,8 @@ static int __io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
struct io_hash_bucket *bucket; struct io_hash_bucket *bucket;
struct io_kiocb *req; struct io_kiocb *req;
if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_ANY)) if (cd->flags & (IORING_ASYNC_CANCEL_FD | IORING_ASYNC_CANCEL_OP |
IORING_ASYNC_CANCEL_ANY))
req = io_poll_file_find(ctx, cd, table, &bucket); req = io_poll_file_find(ctx, cd, table, &bucket);
else else
req = io_poll_find(ctx, false, cd, table, &bucket); req = io_poll_find(ctx, false, cd, table, &bucket);
@ -970,12 +969,12 @@ int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags)
{ {
struct io_poll_update *poll_update = io_kiocb_to_cmd(req, struct io_poll_update); struct io_poll_update *poll_update = io_kiocb_to_cmd(req, struct io_poll_update);
struct io_cancel_data cd = { .data = poll_update->old_user_data, };
struct io_ring_ctx *ctx = req->ctx; struct io_ring_ctx *ctx = req->ctx;
struct io_cancel_data cd = { .ctx = ctx, .data = poll_update->old_user_data, };
struct io_hash_bucket *bucket; struct io_hash_bucket *bucket;
struct io_kiocb *preq; struct io_kiocb *preq;
int ret2, ret = 0; int ret2, ret = 0;
bool locked = true; struct io_tw_state ts = { .locked = true };
io_ring_submit_lock(ctx, issue_flags); io_ring_submit_lock(ctx, issue_flags);
preq = io_poll_find(ctx, true, &cd, &ctx->cancel_table, &bucket); preq = io_poll_find(ctx, true, &cd, &ctx->cancel_table, &bucket);
@ -1024,7 +1023,7 @@ found:
req_set_fail(preq); req_set_fail(preq);
io_req_set_res(preq, -ECANCELED, 0); io_req_set_res(preq, -ECANCELED, 0);
io_req_task_complete(preq, &locked); io_req_task_complete(preq, &ts);
out: out:
io_ring_submit_unlock(ctx, issue_flags); io_ring_submit_unlock(ctx, issue_flags);
if (ret < 0) { if (ret < 0) {

View File

@ -38,3 +38,5 @@ bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
bool cancel_all); bool cancel_all);
void io_apoll_cache_free(struct io_cache_entry *entry); void io_apoll_cache_free(struct io_cache_entry *entry);
void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts);

View File

@ -23,24 +23,21 @@ struct io_rsrc_update {
u32 offset; u32 offset;
}; };
static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
struct io_mapped_ubuf **pimu, struct io_mapped_ubuf **pimu,
struct page **last_hpage); struct page **last_hpage);
#define IO_RSRC_REF_BATCH 100
/* only define max */ /* only define max */
#define IORING_MAX_FIXED_FILES (1U << 20) #define IORING_MAX_FIXED_FILES (1U << 20)
#define IORING_MAX_REG_BUFFERS (1U << 14) #define IORING_MAX_REG_BUFFERS (1U << 14)
void io_rsrc_refs_drop(struct io_ring_ctx *ctx) static const struct io_mapped_ubuf dummy_ubuf = {
__must_hold(&ctx->uring_lock) /* set invalid range, so io_import_fixed() fails meeting it */
{ .ubuf = -1UL,
if (ctx->rsrc_cached_refs) { .ubuf_end = 0,
io_rsrc_put_node(ctx->rsrc_node, ctx->rsrc_cached_refs); };
ctx->rsrc_cached_refs = 0;
}
}
int __io_account_mem(struct user_struct *user, unsigned long nr_pages) int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
{ {
@ -141,7 +138,7 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo
struct io_mapped_ubuf *imu = *slot; struct io_mapped_ubuf *imu = *slot;
unsigned int i; unsigned int i;
if (imu != ctx->dummy_ubuf) { if (imu != &dummy_ubuf) {
for (i = 0; i < imu->nr_bvecs; i++) for (i = 0; i < imu->nr_bvecs; i++)
unpin_user_page(imu->bvec[i].bv_page); unpin_user_page(imu->bvec[i].bv_page);
if (imu->acct_pages) if (imu->acct_pages)
@ -151,216 +148,129 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo
*slot = NULL; *slot = NULL;
} }
void io_rsrc_refs_refill(struct io_ring_ctx *ctx) static void io_rsrc_put_work(struct io_rsrc_node *node)
__must_hold(&ctx->uring_lock)
{ {
ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH; struct io_rsrc_put *prsrc = &node->item;
percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH);
}
static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) if (prsrc->tag)
{ io_post_aux_cqe(node->ctx, prsrc->tag, 0, 0);
struct io_rsrc_data *rsrc_data = ref_node->rsrc_data;
struct io_ring_ctx *ctx = rsrc_data->ctx;
struct io_rsrc_put *prsrc, *tmp;
list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) { switch (node->type) {
list_del(&prsrc->list); case IORING_RSRC_FILE:
io_rsrc_file_put(node->ctx, prsrc);
if (prsrc->tag) { break;
if (ctx->flags & IORING_SETUP_IOPOLL) { case IORING_RSRC_BUFFER:
mutex_lock(&ctx->uring_lock); io_rsrc_buf_put(node->ctx, prsrc);
io_post_aux_cqe(ctx, prsrc->tag, 0, 0); break;
mutex_unlock(&ctx->uring_lock); default:
} else { WARN_ON_ONCE(1);
io_post_aux_cqe(ctx, prsrc->tag, 0, 0); break;
}
}
rsrc_data->do_put(ctx, prsrc);
kfree(prsrc);
}
io_rsrc_node_destroy(ref_node);
if (atomic_dec_and_test(&rsrc_data->refs))
complete(&rsrc_data->done);
}
void io_rsrc_put_work(struct work_struct *work)
{
struct io_ring_ctx *ctx;
struct llist_node *node;
ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work);
node = llist_del_all(&ctx->rsrc_put_llist);
while (node) {
struct io_rsrc_node *ref_node;
struct llist_node *next = node->next;
ref_node = llist_entry(node, struct io_rsrc_node, llist);
__io_rsrc_put_work(ref_node);
node = next;
} }
} }
void io_rsrc_put_tw(struct callback_head *cb) void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
{ {
struct io_ring_ctx *ctx = container_of(cb, struct io_ring_ctx, if (!io_alloc_cache_put(&ctx->rsrc_node_cache, &node->cache))
rsrc_put_tw); kfree(node);
io_rsrc_put_work(&ctx->rsrc_put_work.work);
} }
void io_wait_rsrc_data(struct io_rsrc_data *data) void io_rsrc_node_ref_zero(struct io_rsrc_node *node)
__must_hold(&node->ctx->uring_lock)
{ {
if (data && !atomic_dec_and_test(&data->refs)) struct io_ring_ctx *ctx = node->ctx;
wait_for_completion(&data->done);
}
void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
{
percpu_ref_exit(&ref_node->refs);
kfree(ref_node);
}
static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref)
{
struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
struct io_ring_ctx *ctx = node->rsrc_data->ctx;
unsigned long flags;
bool first_add = false;
unsigned long delay = HZ;
spin_lock_irqsave(&ctx->rsrc_ref_lock, flags);
node->done = true;
/* if we are mid-quiesce then do not delay */
if (node->rsrc_data->quiesce)
delay = 0;
while (!list_empty(&ctx->rsrc_ref_list)) { while (!list_empty(&ctx->rsrc_ref_list)) {
node = list_first_entry(&ctx->rsrc_ref_list, node = list_first_entry(&ctx->rsrc_ref_list,
struct io_rsrc_node, node); struct io_rsrc_node, node);
/* recycle ref nodes in order */ /* recycle ref nodes in order */
if (!node->done) if (node->refs)
break; break;
list_del(&node->node); list_del(&node->node);
first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist);
}
spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags);
if (!first_add) if (likely(!node->empty))
return; io_rsrc_put_work(node);
io_rsrc_node_destroy(ctx, node);
if (ctx->submitter_task) {
if (!task_work_add(ctx->submitter_task, &ctx->rsrc_put_tw,
ctx->notify_method))
return;
} }
mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay); if (list_empty(&ctx->rsrc_ref_list) && unlikely(ctx->rsrc_quiesce))
wake_up_all(&ctx->rsrc_quiesce_wq);
} }
static struct io_rsrc_node *io_rsrc_node_alloc(void) struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
{ {
struct io_rsrc_node *ref_node; struct io_rsrc_node *ref_node;
struct io_cache_entry *entry;
ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL); entry = io_alloc_cache_get(&ctx->rsrc_node_cache);
if (!ref_node) if (entry) {
return NULL; ref_node = container_of(entry, struct io_rsrc_node, cache);
} else {
if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero, ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
0, GFP_KERNEL)) { if (!ref_node)
kfree(ref_node); return NULL;
return NULL;
} }
INIT_LIST_HEAD(&ref_node->node);
INIT_LIST_HEAD(&ref_node->rsrc_list); ref_node->ctx = ctx;
ref_node->done = false; ref_node->empty = 0;
ref_node->refs = 1;
return ref_node; return ref_node;
} }
void io_rsrc_node_switch(struct io_ring_ctx *ctx,
struct io_rsrc_data *data_to_kill)
__must_hold(&ctx->uring_lock)
{
WARN_ON_ONCE(!ctx->rsrc_backup_node);
WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node);
io_rsrc_refs_drop(ctx);
if (data_to_kill) {
struct io_rsrc_node *rsrc_node = ctx->rsrc_node;
rsrc_node->rsrc_data = data_to_kill;
spin_lock_irq(&ctx->rsrc_ref_lock);
list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list);
spin_unlock_irq(&ctx->rsrc_ref_lock);
atomic_inc(&data_to_kill->refs);
percpu_ref_kill(&rsrc_node->refs);
ctx->rsrc_node = NULL;
}
if (!ctx->rsrc_node) {
ctx->rsrc_node = ctx->rsrc_backup_node;
ctx->rsrc_backup_node = NULL;
}
}
int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
{
if (ctx->rsrc_backup_node)
return 0;
ctx->rsrc_backup_node = io_rsrc_node_alloc();
return ctx->rsrc_backup_node ? 0 : -ENOMEM;
}
__cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
struct io_ring_ctx *ctx) struct io_ring_ctx *ctx)
{ {
struct io_rsrc_node *backup;
DEFINE_WAIT(we);
int ret; int ret;
/* As we may drop ->uring_lock, other task may have started quiesce */ /* As We may drop ->uring_lock, other task may have started quiesce */
if (data->quiesce) if (data->quiesce)
return -ENXIO; return -ENXIO;
ret = io_rsrc_node_switch_start(ctx);
if (ret)
return ret;
io_rsrc_node_switch(ctx, data);
/* kill initial ref, already quiesced if zero */ backup = io_rsrc_node_alloc(ctx);
if (atomic_dec_and_test(&data->refs)) if (!backup)
return -ENOMEM;
ctx->rsrc_node->empty = true;
ctx->rsrc_node->type = -1;
list_add_tail(&ctx->rsrc_node->node, &ctx->rsrc_ref_list);
io_put_rsrc_node(ctx, ctx->rsrc_node);
ctx->rsrc_node = backup;
if (list_empty(&ctx->rsrc_ref_list))
return 0; return 0;
if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
atomic_set(&ctx->cq_wait_nr, 1);
smp_mb();
}
ctx->rsrc_quiesce++;
data->quiesce = true; data->quiesce = true;
mutex_unlock(&ctx->uring_lock);
do { do {
prepare_to_wait(&ctx->rsrc_quiesce_wq, &we, TASK_INTERRUPTIBLE);
mutex_unlock(&ctx->uring_lock);
ret = io_run_task_work_sig(ctx); ret = io_run_task_work_sig(ctx);
if (ret < 0) { if (ret < 0) {
atomic_inc(&data->refs);
/* wait for all works potentially completing data->done */
flush_delayed_work(&ctx->rsrc_put_work);
reinit_completion(&data->done);
mutex_lock(&ctx->uring_lock); mutex_lock(&ctx->uring_lock);
if (list_empty(&ctx->rsrc_ref_list))
ret = 0;
break; break;
} }
flush_delayed_work(&ctx->rsrc_put_work); schedule();
ret = wait_for_completion_interruptible(&data->done); __set_current_state(TASK_RUNNING);
if (!ret) { mutex_lock(&ctx->uring_lock);
mutex_lock(&ctx->uring_lock); ret = 0;
if (atomic_read(&data->refs) <= 0) } while (!list_empty(&ctx->rsrc_ref_list));
break;
/*
* it has been revived by another thread while
* we were unlocked
*/
mutex_unlock(&ctx->uring_lock);
}
} while (1);
data->quiesce = false;
finish_wait(&ctx->rsrc_quiesce_wq, &we);
data->quiesce = false;
ctx->rsrc_quiesce--;
if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
atomic_set(&ctx->cq_wait_nr, 0);
smp_mb();
}
return ret; return ret;
} }
@ -405,12 +315,12 @@ static __cold void **io_alloc_page_table(size_t size)
return table; return table;
} }
__cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, __cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, int type,
rsrc_put_fn *do_put, u64 __user *utags, u64 __user *utags,
unsigned nr, struct io_rsrc_data **pdata) unsigned nr, struct io_rsrc_data **pdata)
{ {
struct io_rsrc_data *data; struct io_rsrc_data *data;
int ret = -ENOMEM; int ret = 0;
unsigned i; unsigned i;
data = kzalloc(sizeof(*data), GFP_KERNEL); data = kzalloc(sizeof(*data), GFP_KERNEL);
@ -424,7 +334,7 @@ __cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx,
data->nr = nr; data->nr = nr;
data->ctx = ctx; data->ctx = ctx;
data->do_put = do_put; data->rsrc_type = type;
if (utags) { if (utags) {
ret = -EFAULT; ret = -EFAULT;
for (i = 0; i < nr; i++) { for (i = 0; i < nr; i++) {
@ -435,9 +345,6 @@ __cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx,
goto fail; goto fail;
} }
} }
atomic_set(&data->refs, 1);
init_completion(&data->done);
*pdata = data; *pdata = data;
return 0; return 0;
fail: fail:
@ -453,10 +360,8 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
__s32 __user *fds = u64_to_user_ptr(up->data); __s32 __user *fds = u64_to_user_ptr(up->data);
struct io_rsrc_data *data = ctx->file_data; struct io_rsrc_data *data = ctx->file_data;
struct io_fixed_file *file_slot; struct io_fixed_file *file_slot;
struct file *file;
int fd, i, err = 0; int fd, i, err = 0;
unsigned int done; unsigned int done;
bool needs_switch = false;
if (!ctx->file_data) if (!ctx->file_data)
return -ENXIO; return -ENXIO;
@ -482,16 +387,16 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
file_slot = io_fixed_file_slot(&ctx->file_table, i); file_slot = io_fixed_file_slot(&ctx->file_table, i);
if (file_slot->file_ptr) { if (file_slot->file_ptr) {
file = (struct file *)(file_slot->file_ptr & FFS_MASK); err = io_queue_rsrc_removal(data, i,
err = io_queue_rsrc_removal(data, i, ctx->rsrc_node, file); io_slot_file(file_slot));
if (err) if (err)
break; break;
file_slot->file_ptr = 0; file_slot->file_ptr = 0;
io_file_bitmap_clear(&ctx->file_table, i); io_file_bitmap_clear(&ctx->file_table, i);
needs_switch = true;
} }
if (fd != -1) { if (fd != -1) {
file = fget(fd); struct file *file = fget(fd);
if (!file) { if (!file) {
err = -EBADF; err = -EBADF;
break; break;
@ -519,9 +424,6 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
io_file_bitmap_set(&ctx->file_table, i); io_file_bitmap_set(&ctx->file_table, i);
} }
} }
if (needs_switch)
io_rsrc_node_switch(ctx, data);
return done ? done : err; return done ? done : err;
} }
@ -532,7 +434,6 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
u64 __user *tags = u64_to_user_ptr(up->tags); u64 __user *tags = u64_to_user_ptr(up->tags);
struct iovec iov, __user *iovs = u64_to_user_ptr(up->data); struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
struct page *last_hpage = NULL; struct page *last_hpage = NULL;
bool needs_switch = false;
__u32 done; __u32 done;
int i, err; int i, err;
@ -543,7 +444,6 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
for (done = 0; done < nr_args; done++) { for (done = 0; done < nr_args; done++) {
struct io_mapped_ubuf *imu; struct io_mapped_ubuf *imu;
int offset = up->offset + done;
u64 tag = 0; u64 tag = 0;
err = io_copy_iov(ctx, &iov, iovs, done); err = io_copy_iov(ctx, &iov, iovs, done);
@ -564,24 +464,20 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
if (err) if (err)
break; break;
i = array_index_nospec(offset, ctx->nr_user_bufs); i = array_index_nospec(up->offset + done, ctx->nr_user_bufs);
if (ctx->user_bufs[i] != ctx->dummy_ubuf) { if (ctx->user_bufs[i] != &dummy_ubuf) {
err = io_queue_rsrc_removal(ctx->buf_data, i, err = io_queue_rsrc_removal(ctx->buf_data, i,
ctx->rsrc_node, ctx->user_bufs[i]); ctx->user_bufs[i]);
if (unlikely(err)) { if (unlikely(err)) {
io_buffer_unmap(ctx, &imu); io_buffer_unmap(ctx, &imu);
break; break;
} }
ctx->user_bufs[i] = ctx->dummy_ubuf; ctx->user_bufs[i] = (struct io_mapped_ubuf *)&dummy_ubuf;
needs_switch = true;
} }
ctx->user_bufs[i] = imu; ctx->user_bufs[i] = imu;
*io_get_tag_slot(ctx->buf_data, i) = tag; *io_get_tag_slot(ctx->buf_data, i) = tag;
} }
if (needs_switch)
io_rsrc_node_switch(ctx, ctx->buf_data);
return done ? done : err; return done ? done : err;
} }
@ -590,13 +486,11 @@ static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
unsigned nr_args) unsigned nr_args)
{ {
__u32 tmp; __u32 tmp;
int err;
lockdep_assert_held(&ctx->uring_lock);
if (check_add_overflow(up->offset, nr_args, &tmp)) if (check_add_overflow(up->offset, nr_args, &tmp))
return -EOVERFLOW; return -EOVERFLOW;
err = io_rsrc_node_switch_start(ctx);
if (err)
return err;
switch (type) { switch (type) {
case IORING_RSRC_FILE: case IORING_RSRC_FILE:
@ -753,20 +647,24 @@ int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
return IOU_OK; return IOU_OK;
} }
-int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
-			  struct io_rsrc_node *node, void *rsrc)
+int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void *rsrc)
{
+	struct io_ring_ctx *ctx = data->ctx;
+	struct io_rsrc_node *node = ctx->rsrc_node;
	u64 *tag_slot = io_get_tag_slot(data, idx);
-	struct io_rsrc_put *prsrc;

-	prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
-	if (!prsrc)
+	ctx->rsrc_node = io_rsrc_node_alloc(ctx);
+	if (unlikely(!ctx->rsrc_node)) {
+		ctx->rsrc_node = node;
		return -ENOMEM;
+	}

-	prsrc->tag = *tag_slot;
+	node->item.rsrc = rsrc;
+	node->type = data->rsrc_type;
+	node->item.tag = *tag_slot;
	*tag_slot = 0;
-	prsrc->rsrc = rsrc;
-	list_add(&prsrc->list, &node->rsrc_list);
+	list_add_tail(&node->node, &ctx->rsrc_ref_list);
+	io_put_rsrc_node(ctx, node);
	return 0;
}
@ -881,20 +779,14 @@ int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file)
return 0; return 0;
} }
static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) static __cold void io_rsrc_file_scm_put(struct io_ring_ctx *ctx, struct file *file)
{ {
struct file *file = prsrc->file;
#if defined(CONFIG_UNIX) #if defined(CONFIG_UNIX)
struct sock *sock = ctx->ring_sock->sk; struct sock *sock = ctx->ring_sock->sk;
struct sk_buff_head list, *head = &sock->sk_receive_queue; struct sk_buff_head list, *head = &sock->sk_receive_queue;
struct sk_buff *skb; struct sk_buff *skb;
int i; int i;
if (!io_file_need_scm(file)) {
fput(file);
return;
}
__skb_queue_head_init(&list); __skb_queue_head_init(&list);
/* /*
@ -944,11 +836,19 @@ static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
__skb_queue_tail(head, skb); __skb_queue_tail(head, skb);
spin_unlock_irq(&head->lock); spin_unlock_irq(&head->lock);
} }
#else
fput(file);
#endif #endif
} }
static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
struct file *file = prsrc->file;
if (likely(!io_file_need_scm(file)))
fput(file);
else
io_rsrc_file_scm_put(ctx, file);
}
int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
unsigned nr_args, u64 __user *tags) unsigned nr_args, u64 __user *tags)
{ {
@ -965,10 +865,7 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
return -EMFILE; return -EMFILE;
if (nr_args > rlimit(RLIMIT_NOFILE)) if (nr_args > rlimit(RLIMIT_NOFILE))
return -EMFILE; return -EMFILE;
ret = io_rsrc_node_switch_start(ctx); ret = io_rsrc_data_alloc(ctx, IORING_RSRC_FILE, tags, nr_args,
if (ret)
return ret;
ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args,
&ctx->file_data); &ctx->file_data);
if (ret) if (ret)
return ret; return ret;
@ -1022,7 +919,6 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
/* default it to the whole table */ /* default it to the whole table */
io_file_table_set_alloc_range(ctx, 0, ctx->nr_user_files); io_file_table_set_alloc_range(ctx, 0, ctx->nr_user_files);
io_rsrc_node_switch(ctx, NULL);
return 0; return 0;
fail: fail:
__io_sqe_files_unregister(ctx); __io_sqe_files_unregister(ctx);
@ -1207,8 +1103,9 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
unsigned long off; unsigned long off;
size_t size; size_t size;
int ret, nr_pages, i; int ret, nr_pages, i;
struct folio *folio = NULL;
*pimu = ctx->dummy_ubuf; *pimu = (struct io_mapped_ubuf *)&dummy_ubuf;
if (!iov->iov_base) if (!iov->iov_base)
return 0; return 0;
@ -1221,6 +1118,32 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
goto done; goto done;
} }
/* If it's a huge page, try to coalesce them into a single bvec entry */
if (nr_pages > 1) {
folio = page_folio(pages[0]);
for (i = 1; i < nr_pages; i++) {
/*
* Pages must be consecutive and on the same folio for
* this to work
*/
if (page_folio(pages[i]) != folio ||
pages[i] != pages[i - 1] + 1) {
folio = NULL;
break;
}
}
if (folio) {
/*
* The pages are bound to the folio, it doesn't
* actually unpin them but drops all but one reference,
* which is usually put down by io_buffer_unmap().
* Note, needs a better helper.
*/
unpin_user_pages(&pages[1], nr_pages - 1);
nr_pages = 1;
}
}
imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL); imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
if (!imu) if (!imu)
goto done; goto done;
@ -1233,22 +1156,25 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
off = (unsigned long) iov->iov_base & ~PAGE_MASK; off = (unsigned long) iov->iov_base & ~PAGE_MASK;
size = iov->iov_len; size = iov->iov_len;
for (i = 0; i < nr_pages; i++) {
size_t vec_len;
vec_len = min_t(size_t, size, PAGE_SIZE - off);
imu->bvec[i].bv_page = pages[i];
imu->bvec[i].bv_len = vec_len;
imu->bvec[i].bv_offset = off;
off = 0;
size -= vec_len;
}
/* store original address for later verification */ /* store original address for later verification */
imu->ubuf = (unsigned long) iov->iov_base; imu->ubuf = (unsigned long) iov->iov_base;
imu->ubuf_end = imu->ubuf + iov->iov_len; imu->ubuf_end = imu->ubuf + iov->iov_len;
imu->nr_bvecs = nr_pages; imu->nr_bvecs = nr_pages;
*pimu = imu; *pimu = imu;
ret = 0; ret = 0;
if (folio) {
bvec_set_page(&imu->bvec[0], pages[0], size, off);
goto done;
}
for (i = 0; i < nr_pages; i++) {
size_t vec_len;
vec_len = min_t(size_t, size, PAGE_SIZE - off);
bvec_set_page(&imu->bvec[i], pages[i], vec_len, off);
off = 0;
size -= vec_len;
}
done: done:
if (ret) if (ret)
kvfree(imu); kvfree(imu);
@ -1276,10 +1202,7 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
return -EBUSY; return -EBUSY;
if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS) if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
return -EINVAL; return -EINVAL;
ret = io_rsrc_node_switch_start(ctx); ret = io_rsrc_data_alloc(ctx, IORING_RSRC_BUFFER, tags, nr_args, &data);
if (ret)
return ret;
ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data);
if (ret) if (ret)
return ret; return ret;
ret = io_buffers_map_alloc(ctx, nr_args); ret = io_buffers_map_alloc(ctx, nr_args);
@ -1316,8 +1239,6 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
ctx->buf_data = data; ctx->buf_data = data;
if (ret) if (ret)
__io_sqe_buffers_unregister(ctx); __io_sqe_buffers_unregister(ctx);
else
io_rsrc_node_switch(ctx, NULL);
return ret; return ret;
} }
@ -1337,7 +1258,7 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
return -EFAULT; return -EFAULT;
/* /*
* May not be a start of buffer, set size appropriately * Might not be a start of buffer, set size appropriately
* and advance us to the beginning. * and advance us to the beginning.
*/ */
offset = buf_addr - imu->ubuf; offset = buf_addr - imu->ubuf;
@ -1363,7 +1284,15 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
const struct bio_vec *bvec = imu->bvec; const struct bio_vec *bvec = imu->bvec;
if (offset <= bvec->bv_len) { if (offset <= bvec->bv_len) {
iov_iter_advance(iter, offset); /*
* Note, huge pages buffers consists of one large
* bvec entry and should always go this way. The other
* branch doesn't expect non PAGE_SIZE'd chunks.
*/
iter->bvec = bvec;
iter->nr_segs = bvec->bv_len;
iter->count -= offset;
iter->iov_offset = offset;
} else { } else {
unsigned long seg_skip; unsigned long seg_skip;

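The io_sqe_buffer_register() hunks above coalesce a registered buffer that lives in a single folio (for example one huge page) into a single bvec, and io_import_fixed() grows a matching single-segment branch. A minimal userspace sketch of the path being exercised, assuming liburing is installed and huge pages are available; the 2MB size is illustrative only:

/* register_huge_buf.c - illustrative sketch, not part of this series */
#include <liburing.h>
#include <sys/mman.h>

int main(void)
{
	struct io_uring ring;
	struct iovec iov;

	if (io_uring_queue_init(8, &ring, 0))
		return 1;

	/* one 2MB huge page: the pinned pages share a folio, so the kernel
	 * collapses them into a single bvec entry */
	iov.iov_len = 2 * 1024 * 1024;
	iov.iov_base = mmap(NULL, iov.iov_len, PROT_READ | PROT_WRITE,
			    MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	if (iov.iov_base == MAP_FAILED)
		return 1;

	/* builds the io_mapped_ubuf shown in the diff above */
	if (io_uring_register_buffers(&ring, &iov, 1))
		return 1;

	/* I/O can now use io_uring_prep_read_fixed()/write_fixed() with
	 * buf_index 0 and resolve the buffer via io_import_fixed() */
	io_uring_queue_exit(&ring);
	return 0;
}

Reads into that buffer then take the single-bvec branch added to io_import_fixed() above.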

@ -4,6 +4,10 @@
#include <net/af_unix.h> #include <net/af_unix.h>
#include "alloc_cache.h"
#define IO_NODE_ALLOC_CACHE_MAX 32
#define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3) #define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3)
#define IO_RSRC_TAG_TABLE_MAX (1U << IO_RSRC_TAG_TABLE_SHIFT) #define IO_RSRC_TAG_TABLE_MAX (1U << IO_RSRC_TAG_TABLE_SHIFT)
#define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1) #define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1)
@ -14,7 +18,6 @@ enum {
}; };
struct io_rsrc_put { struct io_rsrc_put {
struct list_head list;
u64 tag; u64 tag;
union { union {
void *rsrc; void *rsrc;
@ -30,19 +33,20 @@ struct io_rsrc_data {
u64 **tags; u64 **tags;
unsigned int nr; unsigned int nr;
rsrc_put_fn *do_put; u16 rsrc_type;
atomic_t refs;
struct completion done;
bool quiesce; bool quiesce;
}; };
struct io_rsrc_node { struct io_rsrc_node {
struct percpu_ref refs; union {
struct io_cache_entry cache;
struct io_ring_ctx *ctx;
};
int refs;
bool empty;
u16 type;
struct list_head node; struct list_head node;
struct list_head rsrc_list; struct io_rsrc_put item;
struct io_rsrc_data *rsrc_data;
struct llist_node llist;
bool done;
}; };
struct io_mapped_ubuf { struct io_mapped_ubuf {
@ -53,17 +57,10 @@ struct io_mapped_ubuf {
struct bio_vec bvec[]; struct bio_vec bvec[];
}; };
void io_rsrc_put_tw(struct callback_head *cb); void io_rsrc_node_ref_zero(struct io_rsrc_node *node);
void io_rsrc_put_work(struct work_struct *work); void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node);
void io_rsrc_refs_refill(struct io_ring_ctx *ctx); struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx);
void io_wait_rsrc_data(struct io_rsrc_data *data); int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void *rsrc);
void io_rsrc_node_destroy(struct io_rsrc_node *ref_node);
void io_rsrc_refs_drop(struct io_ring_ctx *ctx);
int io_rsrc_node_switch_start(struct io_ring_ctx *ctx);
int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
struct io_rsrc_node *node, void *rsrc);
void io_rsrc_node_switch(struct io_ring_ctx *ctx,
struct io_rsrc_data *data_to_kill);
int io_import_fixed(int ddir, struct iov_iter *iter, int io_import_fixed(int ddir, struct iov_iter *iter,
struct io_mapped_ubuf *imu, struct io_mapped_ubuf *imu,
@ -107,36 +104,24 @@ int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
unsigned int size, unsigned int type); unsigned int size, unsigned int type);
static inline void io_rsrc_put_node(struct io_rsrc_node *node, int nr) static inline void io_put_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
{ {
percpu_ref_put_many(&node->refs, nr); lockdep_assert_held(&ctx->uring_lock);
}
static inline void io_req_put_rsrc(struct io_kiocb *req) if (node && !--node->refs)
{ io_rsrc_node_ref_zero(node);
if (req->rsrc_node)
io_rsrc_put_node(req->rsrc_node, 1);
} }
static inline void io_req_put_rsrc_locked(struct io_kiocb *req, static inline void io_req_put_rsrc_locked(struct io_kiocb *req,
struct io_ring_ctx *ctx) struct io_ring_ctx *ctx)
__must_hold(&ctx->uring_lock)
{ {
struct io_rsrc_node *node = req->rsrc_node; io_put_rsrc_node(ctx, req->rsrc_node);
if (node) {
if (node == ctx->rsrc_node)
ctx->rsrc_cached_refs++;
else
io_rsrc_put_node(node, 1);
}
} }
static inline void io_charge_rsrc_node(struct io_ring_ctx *ctx) static inline void io_charge_rsrc_node(struct io_ring_ctx *ctx,
struct io_rsrc_node *node)
{ {
ctx->rsrc_cached_refs--; node->refs++;
if (unlikely(ctx->rsrc_cached_refs < 0))
io_rsrc_refs_refill(ctx);
} }
static inline void io_req_set_rsrc_node(struct io_kiocb *req, static inline void io_req_set_rsrc_node(struct io_kiocb *req,
@ -149,7 +134,7 @@ static inline void io_req_set_rsrc_node(struct io_kiocb *req,
lockdep_assert_held(&ctx->uring_lock); lockdep_assert_held(&ctx->uring_lock);
req->rsrc_node = ctx->rsrc_node; req->rsrc_node = ctx->rsrc_node;
io_charge_rsrc_node(ctx); io_charge_rsrc_node(ctx, ctx->rsrc_node);
io_ring_submit_unlock(ctx, issue_flags); io_ring_submit_unlock(ctx, issue_flags);
} }
} }
@ -162,6 +147,12 @@ static inline u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx)
return &data->tags[table_idx][off]; return &data->tags[table_idx][off];
} }
static inline int io_rsrc_init(struct io_ring_ctx *ctx)
{
ctx->rsrc_node = io_rsrc_node_alloc(ctx);
return ctx->rsrc_node ? 0 : -ENOMEM;
}
int io_files_update(struct io_kiocb *req, unsigned int issue_flags); int io_files_update(struct io_kiocb *req, unsigned int issue_flags);
int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);

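The reworked io_queue_rsrc_removal() above hangs each removed file or buffer off the ring's current rsrc node instead of a separately refcounted percpu node. A hedged userspace sketch of what drives that path, assuming liburing's registered-file helpers and sparse (-1) registration; error handling is trimmed:

/* files_update.c - sketch only */
#include <liburing.h>
#include <fcntl.h>

int main(void)
{
	struct io_uring ring;
	int slots[2] = { -1, -1 };	/* sparse table: two empty slots */
	int fd;

	if (io_uring_queue_init(8, &ring, 0))
		return 1;
	if (io_uring_register_files(&ring, slots, 2))
		return 1;

	/* install a file into slot 0 */
	fd = open("/dev/null", O_RDONLY);
	if (io_uring_register_files_update(&ring, 0, &fd, 1) != 1)
		return 1;

	/* replacing slot 0 makes __io_sqe_files_update() queue the old file
	 * on the current rsrc node via io_queue_rsrc_removal() */
	fd = open("/dev/zero", O_RDONLY);
	if (io_uring_register_files_update(&ring, 0, &fd, 1) != 1)
		return 1;

	io_uring_queue_exit(&ring);
	return 0;
}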

@ -105,6 +105,7 @@ int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
} else { } else {
rw->kiocb.ki_ioprio = get_current_ioprio(); rw->kiocb.ki_ioprio = get_current_ioprio();
} }
rw->kiocb.dio_complete = NULL;
rw->addr = READ_ONCE(sqe->addr); rw->addr = READ_ONCE(sqe->addr);
rw->len = READ_ONCE(sqe->len); rw->len = READ_ONCE(sqe->len);
@ -283,16 +284,25 @@ static inline int io_fixup_rw_res(struct io_kiocb *req, long res)
return res; return res;
} }
static void io_req_rw_complete(struct io_kiocb *req, bool *locked) void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts)
{ {
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
struct kiocb *kiocb = &rw->kiocb;
if ((kiocb->ki_flags & IOCB_DIO_CALLER_COMP) && kiocb->dio_complete) {
long res = kiocb->dio_complete(rw->kiocb.private);
io_req_set_res(req, io_fixup_rw_res(req, res), 0);
}
io_req_io_end(req); io_req_io_end(req);
if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) { if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) {
unsigned issue_flags = *locked ? 0 : IO_URING_F_UNLOCKED; unsigned issue_flags = ts->locked ? 0 : IO_URING_F_UNLOCKED;
req->cqe.flags |= io_put_kbuf(req, issue_flags); req->cqe.flags |= io_put_kbuf(req, issue_flags);
} }
io_req_task_complete(req, locked); io_req_task_complete(req, ts);
} }
static void io_complete_rw(struct kiocb *kiocb, long res) static void io_complete_rw(struct kiocb *kiocb, long res)
@ -300,11 +310,13 @@ static void io_complete_rw(struct kiocb *kiocb, long res)
struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb); struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb);
struct io_kiocb *req = cmd_to_io_kiocb(rw); struct io_kiocb *req = cmd_to_io_kiocb(rw);
if (__io_complete_rw_common(req, res)) if (!kiocb->dio_complete || !(kiocb->ki_flags & IOCB_DIO_CALLER_COMP)) {
return; if (__io_complete_rw_common(req, res))
io_req_set_res(req, io_fixup_rw_res(req, res), 0); return;
io_req_set_res(req, io_fixup_rw_res(req, res), 0);
}
req->io_task_work.func = io_req_rw_complete; req->io_task_work.func = io_req_rw_complete;
io_req_task_work_add(req); __io_req_task_work_add(req, IOU_F_TWQ_LAZY_WAKE);
} }
static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
@ -332,7 +344,7 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret,
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
unsigned final_ret = io_fixup_rw_res(req, ret); unsigned final_ret = io_fixup_rw_res(req, ret);
if (req->flags & REQ_F_CUR_POS) if (ret >= 0 && req->flags & REQ_F_CUR_POS)
req->file->f_pos = rw->kiocb.ki_pos; req->file->f_pos = rw->kiocb.ki_pos;
if (ret >= 0 && (rw->kiocb.ki_complete == io_complete_rw)) { if (ret >= 0 && (rw->kiocb.ki_complete == io_complete_rw)) {
if (!__io_complete_rw_common(req, ret)) { if (!__io_complete_rw_common(req, ret)) {
@ -391,7 +403,7 @@ static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req,
rw->len = sqe_len; rw->len = sqe_len;
} }
ret = import_single_range(ddir, buf, sqe_len, s->fast_iov, iter); ret = import_ubuf(ddir, buf, sqe_len, iter);
if (ret) if (ret)
return ERR_PTR(ret); return ERR_PTR(ret);
return NULL; return NULL;
@ -410,7 +422,7 @@ static inline int io_import_iovec(int rw, struct io_kiocb *req,
unsigned int issue_flags) unsigned int issue_flags)
{ {
*iovec = __io_import_iovec(rw, req, s, issue_flags); *iovec = __io_import_iovec(rw, req, s, issue_flags);
if (unlikely(IS_ERR(*iovec))) if (IS_ERR(*iovec))
return PTR_ERR(*iovec); return PTR_ERR(*iovec);
iov_iter_save_state(&s->iter, &s->iter_state); iov_iter_save_state(&s->iter, &s->iter_state);
@ -447,23 +459,25 @@ static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter)
ppos = io_kiocb_ppos(kiocb); ppos = io_kiocb_ppos(kiocb);
while (iov_iter_count(iter)) { while (iov_iter_count(iter)) {
struct iovec iovec; void __user *addr;
size_t len;
ssize_t nr; ssize_t nr;
if (!iov_iter_is_bvec(iter)) { if (iter_is_ubuf(iter)) {
iovec = iov_iter_iovec(iter); addr = iter->ubuf + iter->iov_offset;
len = iov_iter_count(iter);
} else if (!iov_iter_is_bvec(iter)) {
addr = iter_iov_addr(iter);
len = iter_iov_len(iter);
} else { } else {
iovec.iov_base = u64_to_user_ptr(rw->addr); addr = u64_to_user_ptr(rw->addr);
iovec.iov_len = rw->len; len = rw->len;
} }
if (ddir == READ) { if (ddir == READ)
nr = file->f_op->read(file, iovec.iov_base, nr = file->f_op->read(file, addr, len, ppos);
iovec.iov_len, ppos); else
} else { nr = file->f_op->write(file, addr, len, ppos);
nr = file->f_op->write(file, iovec.iov_base,
iovec.iov_len, ppos);
}
if (nr < 0) { if (nr < 0) {
if (!ret) if (!ret)
@ -479,7 +493,7 @@ static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter)
if (!rw->len) if (!rw->len)
break; break;
} }
if (nr != iovec.iov_len) if (nr != len)
break; break;
} }
@ -495,15 +509,15 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
io->free_iovec = iovec; io->free_iovec = iovec;
io->bytes_done = 0; io->bytes_done = 0;
/* can only be fixed buffers, no need to do anything */ /* can only be fixed buffers, no need to do anything */
if (iov_iter_is_bvec(iter)) if (iov_iter_is_bvec(iter) || iter_is_ubuf(iter))
return; return;
if (!iovec) { if (!iovec) {
unsigned iov_off = 0; unsigned iov_off = 0;
io->s.iter.iov = io->s.fast_iov; io->s.iter.__iov = io->s.fast_iov;
if (iter->iov != fast_iov) { if (iter->__iov != fast_iov) {
iov_off = iter->iov - fast_iov; iov_off = iter_iov(iter) - fast_iov;
io->s.iter.iov += iov_off; io->s.iter.__iov += iov_off;
} }
if (io->s.fast_iov != fast_iov) if (io->s.fast_iov != fast_iov)
memcpy(io->s.fast_iov + iov_off, fast_iov + iov_off, memcpy(io->s.fast_iov + iov_off, fast_iov + iov_off,
@ -516,7 +530,7 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
struct io_rw_state *s, bool force) struct io_rw_state *s, bool force)
{ {
if (!force && !io_op_defs[req->opcode].prep_async) if (!force && !io_cold_defs[req->opcode].prep_async)
return 0; return 0;
if (!req_has_async_data(req)) { if (!req_has_async_data(req)) {
struct io_async_rw *iorw; struct io_async_rw *iorw;
@ -664,8 +678,8 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
if (unlikely(!file || !(file->f_mode & mode))) if (unlikely(!file || !(file->f_mode & mode)))
return -EBADF; return -EBADF;
if (!io_req_ffs_set(req)) if (!(req->flags & REQ_F_FIXED_FILE))
req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT; req->flags |= io_file_get_flags(file);
kiocb->ki_flags = iocb_flags(file); kiocb->ki_flags = iocb_flags(file);
ret = kiocb_set_rw_flags(kiocb, rw->flags); ret = kiocb_set_rw_flags(kiocb, rw->flags);
@ -981,13 +995,6 @@ copy_iov:
return ret; return ret;
} }
static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
{
io_commit_cqring_flush(ctx);
if (ctx->flags & IORING_SETUP_SQPOLL)
io_cqring_wake(ctx);
}
void io_rw_fail(struct io_kiocb *req) void io_rw_fail(struct io_kiocb *req)
{ {
int res; int res;
@ -1058,24 +1065,17 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
if (!smp_load_acquire(&req->iopoll_completed)) if (!smp_load_acquire(&req->iopoll_completed))
break; break;
nr_events++; nr_events++;
if (unlikely(req->flags & REQ_F_CQE_SKIP))
continue;
req->cqe.flags = io_put_kbuf(req, 0); req->cqe.flags = io_put_kbuf(req, 0);
if (unlikely(!__io_fill_cqe_req(ctx, req))) {
spin_lock(&ctx->completion_lock);
io_req_cqe_overflow(req);
spin_unlock(&ctx->completion_lock);
}
} }
if (unlikely(!nr_events)) if (unlikely(!nr_events))
return 0; return 0;
io_commit_cqring(ctx);
io_cqring_ev_posted_iopoll(ctx);
pos = start ? start->next : ctx->iopoll_list.first; pos = start ? start->next : ctx->iopoll_list.first;
wq_list_cut(&ctx->iopoll_list, prev, start); wq_list_cut(&ctx->iopoll_list, prev, start);
io_free_batch_list(ctx, pos);
if (WARN_ON_ONCE(!wq_list_empty(&ctx->submit_state.compl_reqs)))
return 0;
ctx->submit_state.compl_reqs.first = pos;
__io_submit_flush_completions(ctx);
return nr_events; return nr_events;
} }


@ -22,3 +22,4 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags);
int io_writev_prep_async(struct io_kiocb *req); int io_writev_prep_async(struct io_kiocb *req);
void io_readv_writev_cleanup(struct io_kiocb *req); void io_readv_writev_cleanup(struct io_kiocb *req);
void io_rw_fail(struct io_kiocb *req); void io_rw_fail(struct io_kiocb *req);
void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts);


@ -3,6 +3,9 @@
#include <linux/io_uring_types.h> #include <linux/io_uring_types.h>
#define __wq_list_for_each(pos, head) \
for (pos = (head)->first; pos; pos = (pos)->next)
#define wq_list_for_each(pos, prv, head) \ #define wq_list_for_each(pos, prv, head) \
for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next) for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next)
@ -27,28 +30,6 @@ static inline void wq_list_add_after(struct io_wq_work_node *node,
list->last = node; list->last = node;
} }
/**
* wq_list_merge - merge the second list to the first one.
* @list0: the first list
* @list1: the second list
* Return the first node after mergence.
*/
static inline struct io_wq_work_node *wq_list_merge(struct io_wq_work_list *list0,
struct io_wq_work_list *list1)
{
struct io_wq_work_node *ret;
if (!list0->first) {
ret = list1->first;
} else {
ret = list0->first;
list0->last->next = list1->first;
}
INIT_WQ_LIST(list0);
INIT_WQ_LIST(list1);
return ret;
}
static inline void wq_list_add_tail(struct io_wq_work_node *node, static inline void wq_list_add_tail(struct io_wq_work_node *node,
struct io_wq_work_list *list) struct io_wq_work_list *list)
{ {
@ -135,4 +116,4 @@ static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
return container_of(work->list.next, struct io_wq_work, list); return container_of(work->list.next, struct io_wq_work, list);
} }
#endif // INTERNAL_IO_SLIST_H #endif // INTERNAL_IO_SLIST_H


@ -34,6 +34,7 @@ static int __io_splice_prep(struct io_kiocb *req,
if (unlikely(sp->flags & ~valid_flags)) if (unlikely(sp->flags & ~valid_flags))
return -EINVAL; return -EINVAL;
sp->splice_fd_in = READ_ONCE(sqe->splice_fd_in); sp->splice_fd_in = READ_ONCE(sqe->splice_fd_in);
req->flags |= REQ_F_FORCE_ASYNC;
return 0; return 0;
} }
@ -52,8 +53,7 @@ int io_tee(struct io_kiocb *req, unsigned int issue_flags)
struct file *in; struct file *in;
long ret = 0; long ret = 0;
if (issue_flags & IO_URING_F_NONBLOCK) WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
return -EAGAIN;
if (sp->flags & SPLICE_F_FD_IN_FIXED) if (sp->flags & SPLICE_F_FD_IN_FIXED)
in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags); in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags);
@ -68,7 +68,7 @@ int io_tee(struct io_kiocb *req, unsigned int issue_flags)
ret = do_tee(in, out, sp->len, flags); ret = do_tee(in, out, sp->len, flags);
if (!(sp->flags & SPLICE_F_FD_IN_FIXED)) if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
io_put_file(in); fput(in);
done: done:
if (ret != sp->len) if (ret != sp->len)
req_set_fail(req); req_set_fail(req);
@ -94,8 +94,7 @@ int io_splice(struct io_kiocb *req, unsigned int issue_flags)
struct file *in; struct file *in;
long ret = 0; long ret = 0;
if (issue_flags & IO_URING_F_NONBLOCK) WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
return -EAGAIN;
if (sp->flags & SPLICE_F_FD_IN_FIXED) if (sp->flags & SPLICE_F_FD_IN_FIXED)
in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags); in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags);
@ -113,7 +112,7 @@ int io_splice(struct io_kiocb *req, unsigned int issue_flags)
ret = do_splice(in, poff_in, out, poff_out, sp->len, flags); ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
if (!(sp->flags & SPLICE_F_FD_IN_FIXED)) if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
io_put_file(in); fput(in);
done: done:
if (ret != sp->len) if (ret != sp->len)
req_set_fail(req); req_set_fail(req);


@ -255,9 +255,13 @@ static int io_sq_thread(void *data)
sqt_spin = true; sqt_spin = true;
if (sqt_spin || !time_after(jiffies, timeout)) { if (sqt_spin || !time_after(jiffies, timeout)) {
cond_resched();
if (sqt_spin) if (sqt_spin)
timeout = jiffies + sqd->sq_thread_idle; timeout = jiffies + sqd->sq_thread_idle;
if (unlikely(need_resched())) {
mutex_unlock(&sqd->lock);
cond_resched();
mutex_lock(&sqd->lock);
}
continue; continue;
} }
@ -311,7 +315,7 @@ static int io_sq_thread(void *data)
do_exit(0); do_exit(0);
} }
int io_sqpoll_wait_sq(struct io_ring_ctx *ctx) void io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
{ {
DEFINE_WAIT(wait); DEFINE_WAIT(wait);
@ -326,7 +330,6 @@ int io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
} while (!signal_pending(current)); } while (!signal_pending(current));
finish_wait(&ctx->sqo_sq_wait, &wait); finish_wait(&ctx->sqo_sq_wait, &wait);
return 0;
} }
__cold int io_sq_offload_create(struct io_ring_ctx *ctx, __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
@ -418,3 +421,20 @@ err:
io_sq_thread_finish(ctx); io_sq_thread_finish(ctx);
return ret; return ret;
} }
__cold int io_sqpoll_wq_cpu_affinity(struct io_ring_ctx *ctx,
cpumask_var_t mask)
{
struct io_sq_data *sqd = ctx->sq_data;
int ret = -EINVAL;
if (sqd) {
io_sq_thread_park(sqd);
/* Don't set affinity for a dying thread */
if (sqd->thread)
ret = io_wq_cpu_affinity(sqd->thread->io_uring, mask);
io_sq_thread_unpark(sqd);
}
return ret;
}

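io_sqpoll_wq_cpu_affinity() above lets IORING_REGISTER_IOWQ_AFF reach the io-wq owned by the SQPOLL thread rather than the submitter's own. A sketch of the userspace side, assuming liburing's io_uring_register_iowq_aff() wrapper; the CPU number is arbitrary:

#define _GNU_SOURCE
#include <liburing.h>
#include <sched.h>

/* pin the io-wq workers of an IORING_SETUP_SQPOLL ring to one CPU */
static int pin_iowq(struct io_uring *ring, int cpu)
{
	cpu_set_t mask;

	CPU_ZERO(&mask);
	CPU_SET(cpu, &mask);
	/* for SQPOLL rings this now ends up in io_sqpoll_wq_cpu_affinity(),
	 * which parks the SQ thread while the mask is applied */
	return io_uring_register_iowq_aff(ring, sizeof(mask), &mask);
}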

@ -26,4 +26,5 @@ void io_sq_thread_stop(struct io_sq_data *sqd);
void io_sq_thread_park(struct io_sq_data *sqd); void io_sq_thread_park(struct io_sq_data *sqd);
void io_sq_thread_unpark(struct io_sq_data *sqd); void io_sq_thread_unpark(struct io_sq_data *sqd);
void io_put_sq_data(struct io_sq_data *sqd); void io_put_sq_data(struct io_sq_data *sqd);
int io_sqpoll_wait_sq(struct io_ring_ctx *ctx); void io_sqpoll_wait_sq(struct io_ring_ctx *ctx);
int io_sqpoll_wq_cpu_affinity(struct io_ring_ctx *ctx, cpumask_var_t mask);


@ -48,6 +48,7 @@ int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
} }
req->flags |= REQ_F_NEED_CLEANUP; req->flags |= REQ_F_NEED_CLEANUP;
req->flags |= REQ_F_FORCE_ASYNC;
return 0; return 0;
} }
@ -56,8 +57,7 @@ int io_statx(struct io_kiocb *req, unsigned int issue_flags)
struct io_statx *sx = io_kiocb_to_cmd(req, struct io_statx); struct io_statx *sx = io_kiocb_to_cmd(req, struct io_statx);
int ret; int ret;
if (issue_flags & IO_URING_F_NONBLOCK) WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
return -EAGAIN;
ret = do_statx(sx->dfd, sx->filename, sx->flags, sx->mask, sx->buffer); ret = do_statx(sx->dfd, sx->filename, sx->flags, sx->mask, sx->buffer);
io_req_set_res(req, ret, 0); io_req_set_res(req, ret, 0);


@ -32,6 +32,8 @@ int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
sync->off = READ_ONCE(sqe->off); sync->off = READ_ONCE(sqe->off);
sync->len = READ_ONCE(sqe->len); sync->len = READ_ONCE(sqe->len);
sync->flags = READ_ONCE(sqe->sync_range_flags); sync->flags = READ_ONCE(sqe->sync_range_flags);
req->flags |= REQ_F_FORCE_ASYNC;
return 0; return 0;
} }
@ -41,8 +43,7 @@ int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
int ret; int ret;
/* sync_file_range always requires a blocking context */ /* sync_file_range always requires a blocking context */
if (issue_flags & IO_URING_F_NONBLOCK) WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
return -EAGAIN;
ret = sync_file_range(req->file, sync->off, sync->len, sync->flags); ret = sync_file_range(req->file, sync->off, sync->len, sync->flags);
io_req_set_res(req, ret, 0); io_req_set_res(req, ret, 0);
@ -62,6 +63,7 @@ int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
sync->off = READ_ONCE(sqe->off); sync->off = READ_ONCE(sqe->off);
sync->len = READ_ONCE(sqe->len); sync->len = READ_ONCE(sqe->len);
req->flags |= REQ_F_FORCE_ASYNC;
return 0; return 0;
} }
@ -72,8 +74,7 @@ int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
int ret; int ret;
/* fsync always requires a blocking context */ /* fsync always requires a blocking context */
if (issue_flags & IO_URING_F_NONBLOCK) WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
return -EAGAIN;
ret = vfs_fsync_range(req->file, sync->off, end > 0 ? end : LLONG_MAX, ret = vfs_fsync_range(req->file, sync->off, end > 0 ? end : LLONG_MAX,
sync->flags & IORING_FSYNC_DATASYNC); sync->flags & IORING_FSYNC_DATASYNC);
@ -91,6 +92,7 @@ int io_fallocate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
sync->off = READ_ONCE(sqe->off); sync->off = READ_ONCE(sqe->off);
sync->len = READ_ONCE(sqe->addr); sync->len = READ_ONCE(sqe->addr);
sync->mode = READ_ONCE(sqe->len); sync->mode = READ_ONCE(sqe->len);
req->flags |= REQ_F_FORCE_ASYNC;
return 0; return 0;
} }
@ -100,8 +102,8 @@ int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
int ret; int ret;
/* fallocate always requiring blocking context */ /* fallocate always requiring blocking context */
if (issue_flags & IO_URING_F_NONBLOCK) WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
return -EAGAIN;
ret = vfs_fallocate(req->file, sync->mode, sync->off, sync->len); ret = vfs_fallocate(req->file, sync->mode, sync->off, sync->len);
if (ret >= 0) if (ret >= 0)
fsnotify_modify(req->file); fsnotify_modify(req->file);


@ -83,7 +83,7 @@ __cold int io_uring_alloc_task_context(struct task_struct *task,
xa_init(&tctx->xa); xa_init(&tctx->xa);
init_waitqueue_head(&tctx->wait); init_waitqueue_head(&tctx->wait);
atomic_set(&tctx->in_idle, 0); atomic_set(&tctx->in_cancel, 0);
atomic_set(&tctx->inflight_tracked, 0); atomic_set(&tctx->inflight_tracked, 0);
task->io_uring = tctx; task->io_uring = tctx;
init_llist_head(&tctx->task_list); init_llist_head(&tctx->task_list);
@ -208,29 +208,38 @@ void io_uring_unreg_ringfd(void)
} }
} }
int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file,
int start, int end)
{
int offset;
for (offset = start; offset < end; offset++) {
offset = array_index_nospec(offset, IO_RINGFD_REG_MAX);
if (tctx->registered_rings[offset])
continue;
tctx->registered_rings[offset] = file;
return offset;
}
return -EBUSY;
}
static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd,
				     int start, int end)
{
	struct file *file;
	int offset;

-	for (offset = start; offset < end; offset++) {
-		offset = array_index_nospec(offset, IO_RINGFD_REG_MAX);
-		if (tctx->registered_rings[offset])
-			continue;
-
-		file = fget(fd);
-		if (!file) {
-			return -EBADF;
-		} else if (!io_is_uring_fops(file)) {
-			fput(file);
-			return -EOPNOTSUPP;
-		}
-		tctx->registered_rings[offset] = file;
-		return offset;
+	file = fget(fd);
+	if (!file) {
+		return -EBADF;
+	} else if (!io_is_uring_fops(file)) {
+		fput(file);
+		return -EOPNOTSUPP;
	}

-	return -EBUSY;
+	offset = io_ring_add_registered_file(tctx, file, start, end);
+	if (offset < 0)
+		fput(file);
+
+	return offset;
}
/* /*

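io_ring_add_registered_file() above installs the ring's own file in tctx->registered_rings, so later io_uring_enter() calls can pass a registered index instead of the real fd and skip the fdget/fdput. A minimal sketch with liburing, which sets IORING_ENTER_REGISTERED_RING on subsequent submits by itself:

#include <liburing.h>

/* create a ring and register its fd with the current task */
int setup_registered_ring(struct io_uring *ring)
{
	int ret;

	ret = io_uring_queue_init(64, ring, 0);
	if (ret)
		return ret;

	/* lands in io_ring_add_registered_file() via IORING_REGISTER_RING_FDS;
	 * liburing returns 1 (the number of fds registered) on success */
	ret = io_uring_register_ring_fd(ring);
	return ret == 1 ? 0 : ret;
}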

@ -17,6 +17,7 @@ struct io_timeout {
struct file *file; struct file *file;
u32 off; u32 off;
u32 target_seq; u32 target_seq;
u32 repeats;
struct list_head list; struct list_head list;
/* head of the link, used by linked timeouts only */ /* head of the link, used by linked timeouts only */
struct io_kiocb *head; struct io_kiocb *head;
@ -37,8 +38,9 @@ struct io_timeout_rem {
static inline bool io_is_timeout_noseq(struct io_kiocb *req) static inline bool io_is_timeout_noseq(struct io_kiocb *req)
{ {
struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout);
struct io_timeout_data *data = req->async_data;
return !timeout->off; return !timeout->off || data->flags & IORING_TIMEOUT_MULTISHOT;
} }
static inline void io_put_req(struct io_kiocb *req) static inline void io_put_req(struct io_kiocb *req)
@ -49,6 +51,44 @@ static inline void io_put_req(struct io_kiocb *req)
} }
} }
static inline bool io_timeout_finish(struct io_timeout *timeout,
struct io_timeout_data *data)
{
if (!(data->flags & IORING_TIMEOUT_MULTISHOT))
return true;
if (!timeout->off || (timeout->repeats && --timeout->repeats))
return false;
return true;
}
static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer);
static void io_timeout_complete(struct io_kiocb *req, struct io_tw_state *ts)
{
struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout);
struct io_timeout_data *data = req->async_data;
struct io_ring_ctx *ctx = req->ctx;
if (!io_timeout_finish(timeout, data)) {
bool filled;
filled = io_fill_cqe_req_aux(req, ts->locked, -ETIME,
IORING_CQE_F_MORE);
if (filled) {
/* re-arm timer */
spin_lock_irq(&ctx->timeout_lock);
list_add(&timeout->list, ctx->timeout_list.prev);
data->timer.function = io_timeout_fn;
hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
spin_unlock_irq(&ctx->timeout_lock);
return;
}
}
io_req_task_complete(req, ts);
}
static bool io_kill_timeout(struct io_kiocb *req, int status) static bool io_kill_timeout(struct io_kiocb *req, int status)
__must_hold(&req->ctx->timeout_lock) __must_hold(&req->ctx->timeout_lock)
{ {
@ -101,9 +141,9 @@ __cold void io_flush_timeouts(struct io_ring_ctx *ctx)
spin_unlock_irq(&ctx->timeout_lock); spin_unlock_irq(&ctx->timeout_lock);
} }
static void io_req_tw_fail_links(struct io_kiocb *link, bool *locked) static void io_req_tw_fail_links(struct io_kiocb *link, struct io_tw_state *ts)
{ {
io_tw_lock(link->ctx, locked); io_tw_lock(link->ctx, ts);
while (link) { while (link) {
struct io_kiocb *nxt = link->link; struct io_kiocb *nxt = link->link;
long res = -ECANCELED; long res = -ECANCELED;
@ -112,7 +152,7 @@ static void io_req_tw_fail_links(struct io_kiocb *link, bool *locked)
res = link->cqe.res; res = link->cqe.res;
link->link = NULL; link->link = NULL;
io_req_set_res(link, res, 0); io_req_set_res(link, res, 0);
io_req_task_complete(link, locked); io_req_task_complete(link, ts);
link = nxt; link = nxt;
} }
} }
@ -212,7 +252,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
req_set_fail(req); req_set_fail(req);
io_req_set_res(req, -ETIME, 0); io_req_set_res(req, -ETIME, 0);
req->io_task_work.func = io_req_task_complete; req->io_task_work.func = io_timeout_complete;
io_req_task_work_add(req); io_req_task_work_add(req);
return HRTIMER_NORESTART; return HRTIMER_NORESTART;
} }
@ -228,16 +268,10 @@ static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
list_for_each_entry(timeout, &ctx->timeout_list, list) { list_for_each_entry(timeout, &ctx->timeout_list, list) {
struct io_kiocb *tmp = cmd_to_io_kiocb(timeout); struct io_kiocb *tmp = cmd_to_io_kiocb(timeout);
if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) && if (io_cancel_req_match(tmp, cd)) {
cd->data != tmp->cqe.user_data) req = tmp;
continue; break;
if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) {
if (cd->seq == tmp->work.cancel_seq)
continue;
tmp->work.cancel_seq = cd->seq;
} }
req = tmp;
break;
} }
if (!req) if (!req)
return ERR_PTR(-ENOENT); return ERR_PTR(-ENOENT);
@ -265,9 +299,9 @@ int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd)
return 0; return 0;
} }
static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked) static void io_req_task_link_timeout(struct io_kiocb *req, struct io_tw_state *ts)
{ {
unsigned issue_flags = *locked ? 0 : IO_URING_F_UNLOCKED; unsigned issue_flags = ts->locked ? 0 : IO_URING_F_UNLOCKED;
struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout);
struct io_kiocb *prev = timeout->prev; struct io_kiocb *prev = timeout->prev;
int ret = -ENOENT; int ret = -ENOENT;
@ -282,11 +316,11 @@ static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked)
ret = io_try_cancel(req->task->io_uring, &cd, issue_flags); ret = io_try_cancel(req->task->io_uring, &cd, issue_flags);
} }
io_req_set_res(req, ret ?: -ETIME, 0); io_req_set_res(req, ret ?: -ETIME, 0);
io_req_task_complete(req, locked); io_req_task_complete(req, ts);
io_put_req(prev); io_put_req(prev);
} else { } else {
io_req_set_res(req, -ETIME, 0); io_req_set_res(req, -ETIME, 0);
io_req_task_complete(req, locked); io_req_task_complete(req, ts);
} }
} }
@ -369,7 +403,7 @@ static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
struct timespec64 *ts, enum hrtimer_mode mode) struct timespec64 *ts, enum hrtimer_mode mode)
__must_hold(&ctx->timeout_lock) __must_hold(&ctx->timeout_lock)
{ {
struct io_cancel_data cd = { .data = user_data, }; struct io_cancel_data cd = { .ctx = ctx, .data = user_data, };
struct io_kiocb *req = io_timeout_extract(ctx, &cd); struct io_kiocb *req = io_timeout_extract(ctx, &cd);
struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout);
struct io_timeout_data *data; struct io_timeout_data *data;
@ -433,7 +467,7 @@ int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
int ret; int ret;
if (!(tr->flags & IORING_TIMEOUT_UPDATE)) { if (!(tr->flags & IORING_TIMEOUT_UPDATE)) {
struct io_cancel_data cd = { .data = tr->addr, }; struct io_cancel_data cd = { .ctx = ctx, .data = tr->addr, };
spin_lock(&ctx->completion_lock); spin_lock(&ctx->completion_lock);
ret = io_timeout_cancel(ctx, &cd); ret = io_timeout_cancel(ctx, &cd);
@ -470,16 +504,27 @@ static int __io_timeout_prep(struct io_kiocb *req,
return -EINVAL; return -EINVAL;
flags = READ_ONCE(sqe->timeout_flags); flags = READ_ONCE(sqe->timeout_flags);
if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK | if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK |
IORING_TIMEOUT_ETIME_SUCCESS)) IORING_TIMEOUT_ETIME_SUCCESS |
IORING_TIMEOUT_MULTISHOT))
return -EINVAL; return -EINVAL;
/* more than one clock specified is invalid, obviously */ /* more than one clock specified is invalid, obviously */
if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1) if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
return -EINVAL; return -EINVAL;
/* multishot requests only make sense with rel values */
if (!(~flags & (IORING_TIMEOUT_MULTISHOT | IORING_TIMEOUT_ABS)))
return -EINVAL;
INIT_LIST_HEAD(&timeout->list); INIT_LIST_HEAD(&timeout->list);
timeout->off = off; timeout->off = off;
if (unlikely(off && !req->ctx->off_timeout_used)) if (unlikely(off && !req->ctx->off_timeout_used))
req->ctx->off_timeout_used = true; req->ctx->off_timeout_used = true;
/*
* for multishot reqs w/ fixed nr of repeats, repeats tracks the
* remaining nr
*/
timeout->repeats = 0;
if ((flags & IORING_TIMEOUT_MULTISHOT) && off > 0)
timeout->repeats = off;
if (WARN_ON_ONCE(req_has_async_data(req))) if (WARN_ON_ONCE(req_has_async_data(req)))
return -EFAULT; return -EFAULT;
@ -543,7 +588,7 @@ int io_timeout(struct io_kiocb *req, unsigned int issue_flags)
goto add; goto add;
} }
tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); tail = data_race(ctx->cached_cq_tail) - atomic_read(&ctx->cq_timeouts);
timeout->target_seq = tail + off; timeout->target_seq = tail + off;
/* Update the last seq here in case io_flush_timeouts() hasn't. /* Update the last seq here in case io_flush_timeouts() hasn't.

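The IORING_TIMEOUT_MULTISHOT handling above re-arms the timer from io_timeout_complete() and posts -ETIME completions with IORING_CQE_F_MORE until the repeat count runs out. A userspace sketch, assuming a liburing/uapi header recent enough to define IORING_TIMEOUT_MULTISHOT; the values are illustrative:

#include <liburing.h>
#include <stdio.h>

int main(void)
{
	struct __kernel_timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
	struct io_uring_cqe *cqe;
	struct io_uring_sqe *sqe;
	struct io_uring ring;
	int i;

	if (io_uring_queue_init(8, &ring, 0))
		return 1;

	sqe = io_uring_get_sqe(&ring);
	/* count=3 becomes timeout->repeats above: three firings, then done */
	io_uring_prep_timeout(sqe, &ts, 3, IORING_TIMEOUT_MULTISHOT);
	io_uring_submit(&ring);

	for (i = 0; i < 3; i++) {
		if (io_uring_wait_cqe(&ring, &cqe))
			break;
		/* re-armed firings carry -ETIME plus IORING_CQE_F_MORE;
		 * the final completion drops the MORE flag */
		printf("tick %d: res=%d more=%d\n", i, cqe->res,
		       !!(cqe->flags & IORING_CQE_F_MORE));
		io_uring_cqe_seen(&ring, cqe);
	}
	io_uring_queue_exit(&ring);
	return 0;
}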

@ -7,36 +7,44 @@
#include <linux/nospec.h> #include <linux/nospec.h>
#include <uapi/linux/io_uring.h> #include <uapi/linux/io_uring.h>
#include <uapi/asm-generic/ioctls.h>
#include "io_uring.h" #include "io_uring.h"
#include "rsrc.h" #include "rsrc.h"
#include "uring_cmd.h" #include "uring_cmd.h"
static void io_uring_cmd_work(struct io_kiocb *req, bool *locked) static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts)
{ {
struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
unsigned issue_flags = *locked ? 0 : IO_URING_F_UNLOCKED; unsigned issue_flags = ts->locked ? 0 : IO_URING_F_UNLOCKED;
ioucmd->task_work_cb(ioucmd, issue_flags); ioucmd->task_work_cb(ioucmd, issue_flags);
} }
void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
void (*task_work_cb)(struct io_uring_cmd *, unsigned)) void (*task_work_cb)(struct io_uring_cmd *, unsigned),
unsigned flags)
{ {
struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
ioucmd->task_work_cb = task_work_cb; ioucmd->task_work_cb = task_work_cb;
req->io_task_work.func = io_uring_cmd_work; req->io_task_work.func = io_uring_cmd_work;
io_req_task_work_add(req); __io_req_task_work_add(req, flags);
} }
EXPORT_SYMBOL_GPL(io_uring_cmd_complete_in_task); EXPORT_SYMBOL_GPL(__io_uring_cmd_do_in_task);
void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd,
void (*task_work_cb)(struct io_uring_cmd *, unsigned))
{
__io_uring_cmd_do_in_task(ioucmd, task_work_cb, IOU_F_TWQ_LAZY_WAKE);
}
EXPORT_SYMBOL_GPL(io_uring_cmd_do_in_task_lazy);
static inline void io_req_set_cqe32_extra(struct io_kiocb *req, static inline void io_req_set_cqe32_extra(struct io_kiocb *req,
u64 extra1, u64 extra2) u64 extra1, u64 extra2)
{ {
req->extra1 = extra1; req->big_cqe.extra1 = extra1;
req->extra2 = extra2; req->big_cqe.extra2 = extra2;
req->flags |= REQ_F_CQE32_INIT;
} }
/* /*
@ -54,25 +62,24 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2,
io_req_set_res(req, ret, 0); io_req_set_res(req, ret, 0);
if (req->ctx->flags & IORING_SETUP_CQE32) if (req->ctx->flags & IORING_SETUP_CQE32)
io_req_set_cqe32_extra(req, res2, 0); io_req_set_cqe32_extra(req, res2, 0);
if (req->ctx->flags & IORING_SETUP_IOPOLL) if (req->ctx->flags & IORING_SETUP_IOPOLL) {
/* order with io_iopoll_req_issued() checking ->iopoll_complete */ /* order with io_iopoll_req_issued() checking ->iopoll_complete */
smp_store_release(&req->iopoll_completed, 1); smp_store_release(&req->iopoll_completed, 1);
else } else {
io_req_complete_post(req, issue_flags); struct io_tw_state ts = {
.locked = !(issue_flags & IO_URING_F_UNLOCKED),
};
io_req_task_complete(req, &ts);
}
} }
EXPORT_SYMBOL_GPL(io_uring_cmd_done); EXPORT_SYMBOL_GPL(io_uring_cmd_done);
int io_uring_cmd_prep_async(struct io_kiocb *req) int io_uring_cmd_prep_async(struct io_kiocb *req)
{ {
struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
size_t cmd_size;
BUILD_BUG_ON(uring_cmd_pdu_size(0) != 16); memcpy(req->async_data, ioucmd->sqe, uring_sqe_size(req->ctx));
BUILD_BUG_ON(uring_cmd_pdu_size(1) != 80); ioucmd->sqe = req->async_data;
cmd_size = uring_cmd_pdu_size(req->ctx->flags & IORING_SETUP_SQE128);
memcpy(req->async_data, ioucmd->cmd, cmd_size);
return 0; return 0;
} }
@ -98,7 +105,7 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
req->imu = ctx->user_bufs[index]; req->imu = ctx->user_bufs[index];
io_req_set_rsrc_node(req, ctx, 0); io_req_set_rsrc_node(req, ctx, 0);
} }
ioucmd->cmd = sqe->cmd; ioucmd->sqe = sqe;
ioucmd->cmd_op = READ_ONCE(sqe->cmd_op); ioucmd->cmd_op = READ_ONCE(sqe->cmd_op);
return 0; return 0;
} }
@ -129,9 +136,6 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
WRITE_ONCE(ioucmd->cookie, NULL); WRITE_ONCE(ioucmd->cookie, NULL);
} }
if (req_has_async_data(req))
ioucmd->cmd = req->async_data;
ret = file->f_op->uring_cmd(ioucmd, issue_flags); ret = file->f_op->uring_cmd(ioucmd, issue_flags);
if (ret == -EAGAIN) { if (ret == -EAGAIN) {
if (!req_has_async_data(req)) { if (!req_has_async_data(req)) {
@ -160,3 +164,30 @@ int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
return io_import_fixed(rw, iter, req->imu, ubuf, len); return io_import_fixed(rw, iter, req->imu, ubuf, len);
} }
EXPORT_SYMBOL_GPL(io_uring_cmd_import_fixed); EXPORT_SYMBOL_GPL(io_uring_cmd_import_fixed);
int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
struct socket *sock = cmd->file->private_data;
struct sock *sk = sock->sk;
struct proto *prot = READ_ONCE(sk->sk_prot);
int ret, arg = 0;
if (!prot || !prot->ioctl)
return -EOPNOTSUPP;
switch (cmd->sqe->cmd_op) {
case SOCKET_URING_OP_SIOCINQ:
ret = prot->ioctl(sk, SIOCINQ, &arg);
if (ret)
return ret;
return arg;
case SOCKET_URING_OP_SIOCOUTQ:
ret = prot->ioctl(sk, SIOCOUTQ, &arg);
if (ret)
return ret;
return arg;
default:
return -EOPNOTSUPP;
}
}
EXPORT_SYMBOL_GPL(io_uring_cmd_sock);

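io_uring_cmd_sock() above services SOCKET_URING_OP_SIOCINQ/SIOCOUTQ through IORING_OP_URING_CMD and hands the byte count back in cqe->res. A hedged sketch that builds the SQE by hand (newer liburing also ships a dedicated io_uring_prep_cmd_sock() helper, not relied on here); sockfd is assumed to be an open socket:

#include <liburing.h>

/* bytes currently queued in sockfd's receive buffer, or -errno */
static int socket_inq(struct io_uring *ring, int sockfd)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int ret;

	io_uring_prep_rw(IORING_OP_URING_CMD, sqe, sockfd, NULL, 0, 0);
	sqe->cmd_op = SOCKET_URING_OP_SIOCINQ;

	io_uring_submit(ring);
	ret = io_uring_wait_cqe(ring, &cqe);
	if (ret)
		return ret;
	ret = cqe->res;		/* io_uring_cmd_sock() returns the SIOCINQ value */
	io_uring_cqe_seen(ring, cqe);
	return ret;
}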

@ -3,11 +3,3 @@
int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags); int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags);
int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_uring_cmd_prep_async(struct io_kiocb *req); int io_uring_cmd_prep_async(struct io_kiocb *req);
/*
* The URING_CMD payload starts at 'cmd' in the first sqe, and continues into
* the following sqe if SQE128 is used.
*/
#define uring_cmd_pdu_size(is_sqe128) \
((1 + !!(is_sqe128)) * sizeof(struct io_uring_sqe) - \
offsetof(struct io_uring_sqe, cmd))


@ -75,6 +75,7 @@ static int __io_getxattr_prep(struct io_kiocb *req,
} }
req->flags |= REQ_F_NEED_CLEANUP; req->flags |= REQ_F_NEED_CLEANUP;
req->flags |= REQ_F_FORCE_ASYNC;
return 0; return 0;
} }
@ -109,8 +110,7 @@ int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags)
struct io_xattr *ix = io_kiocb_to_cmd(req, struct io_xattr); struct io_xattr *ix = io_kiocb_to_cmd(req, struct io_xattr);
int ret; int ret;
if (issue_flags & IO_URING_F_NONBLOCK) WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
return -EAGAIN;
ret = do_getxattr(mnt_user_ns(req->file->f_path.mnt), ret = do_getxattr(mnt_user_ns(req->file->f_path.mnt),
req->file->f_path.dentry, req->file->f_path.dentry,
@ -127,8 +127,7 @@ int io_getxattr(struct io_kiocb *req, unsigned int issue_flags)
struct path path; struct path path;
int ret; int ret;
if (issue_flags & IO_URING_F_NONBLOCK) WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
return -EAGAIN;
retry: retry:
ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL); ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL);
@ -176,6 +175,7 @@ static int __io_setxattr_prep(struct io_kiocb *req,
} }
req->flags |= REQ_F_NEED_CLEANUP; req->flags |= REQ_F_NEED_CLEANUP;
req->flags |= REQ_F_FORCE_ASYNC;
return 0; return 0;
} }
@ -224,8 +224,7 @@ int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags)
{ {
int ret; int ret;
if (issue_flags & IO_URING_F_NONBLOCK) WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
return -EAGAIN;
ret = __io_setxattr(req, issue_flags, &req->file->f_path); ret = __io_setxattr(req, issue_flags, &req->file->f_path);
io_xattr_finish(req, ret); io_xattr_finish(req, ret);
@ -239,8 +238,7 @@ int io_setxattr(struct io_kiocb *req, unsigned int issue_flags)
struct path path; struct path path;
int ret; int ret;
if (issue_flags & IO_URING_F_NONBLOCK) WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
return -EAGAIN;
retry: retry:
ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL); ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL);
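
The xattr hunks above stop bouncing the nonblocking issue path with -EAGAIN and instead set REQ_F_FORCE_ASYNC at prep time, so these requests always run from io-wq and the WARN_ON_ONCE() should never fire. Userspace is unchanged; a hedged liburing sketch (io_uring_prep_getxattr()'s argument order is quoted from memory of liburing 2.2+, so verify it against your liburing.h) looks like:

#include <errno.h>
#include <liburing.h>

/* Hypothetical: read "user.comment" from ./file through IORING_OP_GETXATTR. */
static int getxattr_via_ring(struct io_uring *ring, char *value, unsigned int len)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int ret;

	if (!sqe)
		return -EBUSY;
	io_uring_prep_getxattr(sqe, "user.comment", value, "./file", len);

	if (io_uring_submit(ring) != 1)
		return -EIO;
	ret = io_uring_wait_cqe(ring, &cqe);
	if (ret < 0)
		return ret;
	ret = cqe->res;			/* >= 0: value length, < 0: -errno */
	io_uring_cqe_seen(ring, cqe);
	return ret;
}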

View File

@ -126,13 +126,13 @@ __out: \
iterate_buf(i, n, base, len, off, \ iterate_buf(i, n, base, len, off, \
i->ubuf, (I)) \ i->ubuf, (I)) \
} else if (likely(iter_is_iovec(i))) { \ } else if (likely(iter_is_iovec(i))) { \
const struct iovec *iov = i->iov; \ const struct iovec *iov = iter_iov(i); \
void __user *base; \ void __user *base; \
size_t len; \ size_t len; \
iterate_iovec(i, n, base, len, off, \ iterate_iovec(i, n, base, len, off, \
iov, (I)) \ iov, (I)) \
i->nr_segs -= iov - i->iov; \ i->nr_segs -= iov - iter_iov(i); \
i->iov = iov; \ i->__iov = iov; \
} else if (iov_iter_is_bvec(i)) { \ } else if (iov_iter_is_bvec(i)) { \
const struct bio_vec *bvec = i->bvec; \ const struct bio_vec *bvec = i->bvec; \
void *base; \ void *base; \
@ -361,7 +361,7 @@ size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t size)
size_t skip; size_t skip;
size -= count; size -= count;
for (p = i->iov, skip = i->iov_offset; count; p++, skip = 0) { for (p = iter_iov(i), skip = i->iov_offset; count; p++, skip = 0) {
size_t len = min(count, p->iov_len - skip); size_t len = min(count, p->iov_len - skip);
size_t ret; size_t ret;
@ -404,7 +404,7 @@ size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t size)
size_t skip; size_t skip;
size -= count; size -= count;
for (p = i->iov, skip = i->iov_offset; count; p++, skip = 0) { for (p = iter_iov(i), skip = i->iov_offset; count; p++, skip = 0) {
size_t len = min(count, p->iov_len - skip); size_t len = min(count, p->iov_len - skip);
size_t ret; size_t ret;
@ -431,7 +431,7 @@ void iov_iter_init(struct iov_iter *i, unsigned int direction,
.nofault = false, .nofault = false,
.user_backed = true, .user_backed = true,
.data_source = direction, .data_source = direction,
.iov = iov, .__iov = iov,
.nr_segs = nr_segs, .nr_segs = nr_segs,
.iov_offset = 0, .iov_offset = 0,
.count = count .count = count
@ -881,14 +881,14 @@ static void iov_iter_iovec_advance(struct iov_iter *i, size_t size)
i->count -= size; i->count -= size;
size += i->iov_offset; // from beginning of current segment size += i->iov_offset; // from beginning of current segment
for (iov = i->iov, end = iov + i->nr_segs; iov < end; iov++) { for (iov = iter_iov(i), end = iov + i->nr_segs; iov < end; iov++) {
if (likely(size < iov->iov_len)) if (likely(size < iov->iov_len))
break; break;
size -= iov->iov_len; size -= iov->iov_len;
} }
i->iov_offset = size; i->iov_offset = size;
i->nr_segs -= iov - i->iov; i->nr_segs -= iov - iter_iov(i);
i->iov = iov; i->__iov = iov;
} }
void iov_iter_advance(struct iov_iter *i, size_t size) void iov_iter_advance(struct iov_iter *i, size_t size)
@ -963,12 +963,12 @@ void iov_iter_revert(struct iov_iter *i, size_t unroll)
unroll -= n; unroll -= n;
} }
} else { /* same logics for iovec and kvec */ } else { /* same logics for iovec and kvec */
const struct iovec *iov = i->iov; const struct iovec *iov = iter_iov(i);
while (1) { while (1) {
size_t n = (--iov)->iov_len; size_t n = (--iov)->iov_len;
i->nr_segs++; i->nr_segs++;
if (unroll <= n) { if (unroll <= n) {
i->iov = iov; i->__iov = iov;
i->iov_offset = n - unroll; i->iov_offset = n - unroll;
return; return;
} }
@ -985,7 +985,7 @@ size_t iov_iter_single_seg_count(const struct iov_iter *i)
{ {
if (i->nr_segs > 1) { if (i->nr_segs > 1) {
if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
return min(i->count, i->iov->iov_len - i->iov_offset); return min(i->count, iter_iov(i)->iov_len - i->iov_offset);
if (iov_iter_is_bvec(i)) if (iov_iter_is_bvec(i))
return min(i->count, i->bvec->bv_len - i->iov_offset); return min(i->count, i->bvec->bv_len - i->iov_offset);
} }
@ -1100,13 +1100,14 @@ static bool iov_iter_aligned_iovec(const struct iov_iter *i, unsigned addr_mask,
unsigned k; unsigned k;
for (k = 0; k < i->nr_segs; k++, skip = 0) { for (k = 0; k < i->nr_segs; k++, skip = 0) {
size_t len = i->iov[k].iov_len - skip; const struct iovec *iov = iter_iov(i) + k;
size_t len = iov->iov_len - skip;
if (len > size) if (len > size)
len = size; len = size;
if (len & len_mask) if (len & len_mask)
return false; return false;
if ((unsigned long)(i->iov[k].iov_base + skip) & addr_mask) if ((unsigned long)(iov->iov_base + skip) & addr_mask)
return false; return false;
size -= len; size -= len;
@ -1199,9 +1200,10 @@ static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i)
unsigned k; unsigned k;
for (k = 0; k < i->nr_segs; k++, skip = 0) { for (k = 0; k < i->nr_segs; k++, skip = 0) {
size_t len = i->iov[k].iov_len - skip; const struct iovec *iov = iter_iov(i) + k;
size_t len = iov->iov_len - skip;
if (len) { if (len) {
res |= (unsigned long)i->iov[k].iov_base + skip; res |= (unsigned long)iov->iov_base + skip;
if (len > size) if (len > size)
len = size; len = size;
res |= len; res |= len;
@ -1278,14 +1280,15 @@ unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
return ~0U; return ~0U;
for (k = 0; k < i->nr_segs; k++) { for (k = 0; k < i->nr_segs; k++) {
if (i->iov[k].iov_len) { const struct iovec *iov = iter_iov(i) + k;
unsigned long base = (unsigned long)i->iov[k].iov_base; if (iov->iov_len) {
unsigned long base = (unsigned long)iov->iov_base;
if (v) // if not the first one if (v) // if not the first one
res |= base | v; // this start | previous end res |= base | v; // this start | previous end
v = base + i->iov[k].iov_len; v = base + iov->iov_len;
if (size <= i->iov[k].iov_len) if (size <= iov->iov_len)
break; break;
size -= i->iov[k].iov_len; size -= iov->iov_len;
} }
} }
return res; return res;
@ -1401,13 +1404,14 @@ static unsigned long first_iovec_segment(const struct iov_iter *i, size_t *size)
return (unsigned long)i->ubuf + i->iov_offset; return (unsigned long)i->ubuf + i->iov_offset;
for (k = 0, skip = i->iov_offset; k < i->nr_segs; k++, skip = 0) { for (k = 0, skip = i->iov_offset; k < i->nr_segs; k++, skip = 0) {
size_t len = i->iov[k].iov_len - skip; const struct iovec *iov = iter_iov(i) + k;
size_t len = iov->iov_len - skip;
if (unlikely(!len)) if (unlikely(!len))
continue; continue;
if (*size > len) if (*size > len)
*size = len; *size = len;
return (unsigned long)i->iov[k].iov_base + skip; return (unsigned long)iov->iov_base + skip;
} }
BUG(); // if it had been empty, we wouldn't get called BUG(); // if it had been empty, we wouldn't get called
} }
@ -1596,7 +1600,7 @@ static int iov_npages(const struct iov_iter *i, int maxpages)
const struct iovec *p; const struct iovec *p;
int npages = 0; int npages = 0;
for (p = i->iov; size; skip = 0, p++) { for (p = iter_iov(i); size; skip = 0, p++) {
unsigned offs = offset_in_page(p->iov_base + skip); unsigned offs = offset_in_page(p->iov_base + skip);
size_t len = min(p->iov_len - skip, size); size_t len = min(p->iov_len - skip, size);
@ -1673,7 +1677,7 @@ const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags)
flags); flags);
else if (iov_iter_is_kvec(new) || iter_is_iovec(new)) else if (iov_iter_is_kvec(new) || iter_is_iovec(new))
/* iovec and kvec have identical layout */ /* iovec and kvec have identical layout */
return new->iov = kmemdup(new->iov, return new->__iov = kmemdup(new->__iov,
new->nr_segs * sizeof(struct iovec), new->nr_segs * sizeof(struct iovec),
flags); flags);
return NULL; return NULL;
@ -1855,6 +1859,17 @@ int import_single_range(int rw, void __user *buf, size_t len,
} }
EXPORT_SYMBOL(import_single_range); EXPORT_SYMBOL(import_single_range);
int import_ubuf(int rw, void __user *buf, size_t len, struct iov_iter *i)
{
if (len > MAX_RW_COUNT)
len = MAX_RW_COUNT;
if (unlikely(!access_ok(buf, len)))
return -EFAULT;
iov_iter_ubuf(i, rw, buf, len);
return 0;
}
/** /**
* iov_iter_restore() - Restore a &struct iov_iter to the same state as when * iov_iter_restore() - Restore a &struct iov_iter to the same state as when
* iov_iter_save_state() was called. * iov_iter_save_state() was called.
@ -1869,8 +1884,8 @@ EXPORT_SYMBOL(import_single_range);
*/ */
void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state) void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state)
{ {
if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i)) && if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i) &&
!iov_iter_is_kvec(i) && !iter_is_ubuf(i)) !iter_is_ubuf(i)) && !iov_iter_is_kvec(i))
return; return;
i->iov_offset = state->iov_offset; i->iov_offset = state->iov_offset;
i->count = state->count; i->count = state->count;
@ -1889,6 +1904,6 @@ void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state)
if (iov_iter_is_bvec(i)) if (iov_iter_is_bvec(i))
i->bvec -= state->nr_segs - i->nr_segs; i->bvec -= state->nr_segs - i->nr_segs;
else else
i->iov -= state->nr_segs - i->nr_segs; i->__iov -= state->nr_segs - i->nr_segs;
i->nr_segs = state->nr_segs; i->nr_segs = state->nr_segs;
} }
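
The import_ubuf() helper added above gives callers with a single user buffer an ITER_UBUF iterator without faking a one-element iovec. A minimal, hypothetical kernel-side use follows; foo_read() and foo_msg are illustrative only, and ITER_DEST is assumed to be the direction constant in this series (older trees spell it READ).

#include <linux/fs.h>
#include <linux/minmax.h>
#include <linux/uio.h>

/* Hypothetical read handler copying a fixed message into the user buffer. */
static ssize_t foo_read(struct file *file, char __user *buf,
			size_t len, loff_t *ppos)
{
	static const char foo_msg[] = "hello from foo\n";
	struct iov_iter iter;
	int ret;

	/* Validates the user pointer and sets up a single-segment ITER_UBUF. */
	ret = import_ubuf(ITER_DEST, buf, len, &iter);
	if (ret)
		return ret;

	/* copy_to_iter() returns the number of bytes actually copied. */
	return copy_to_iter(foo_msg, min(len, sizeof(foo_msg)), &iter);
}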

View File

@ -1478,7 +1478,7 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
size_t, vlen, int, behavior, unsigned int, flags) size_t, vlen, int, behavior, unsigned int, flags)
{ {
ssize_t ret; ssize_t ret;
struct iovec iovstack[UIO_FASTIOV], iovec; struct iovec iovstack[UIO_FASTIOV];
struct iovec *iov = iovstack; struct iovec *iov = iovstack;
struct iov_iter iter; struct iov_iter iter;
struct task_struct *task; struct task_struct *task;
@ -1525,12 +1525,11 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
total_len = iov_iter_count(&iter); total_len = iov_iter_count(&iter);
while (iov_iter_count(&iter)) { while (iov_iter_count(&iter)) {
iovec = iov_iter_iovec(&iter); ret = do_madvise(mm, (unsigned long)iter_iov_addr(&iter),
ret = do_madvise(mm, (unsigned long)iovec.iov_base, iter_iov_len(&iter), behavior);
iovec.iov_len, behavior);
if (ret < 0) if (ret < 0)
break; break;
iov_iter_advance(&iter, iovec.iov_len); iov_iter_advance(&iter, iter_iov_len(&iter));
} }
ret = (total_len - iov_iter_count(&iter)) ? : ret; ret = (total_len - iov_iter_count(&iter)) ? : ret;
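
The iter_iov(), iter_iov_addr() and iter_iov_len() accessors used here and in the lib/iov_iter.c hunks above replace direct i->iov dereferences so the field could be renamed to __iov. The header change is not part of this excerpt; the macros are presumably defined roughly as follows in include/linux/uio.h (a sketch, not the authoritative definition):

/* Presumed definitions; the authoritative ones live in include/linux/uio.h. */
#define iter_iov(iter)		(iter)->__iov
#define iter_iov_addr(iter)	(iter_iov(iter)->iov_base + (iter)->iov_offset)
#define iter_iov_len(iter)	(iter_iov(iter)->iov_len - (iter)->iov_offset)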

View File

@ -975,9 +975,10 @@ static int do_mmap_private(struct vm_area_struct *vma,
*/ */
if (capabilities & NOMMU_MAP_DIRECT) { if (capabilities & NOMMU_MAP_DIRECT) {
ret = call_mmap(vma->vm_file, vma); ret = call_mmap(vma->vm_file, vma);
/* shouldn't return success if we're not sharing */
if (WARN_ON_ONCE(!is_nommu_shared_mapping(vma->vm_flags)))
ret = -ENOSYS;
if (ret == 0) { if (ret == 0) {
/* shouldn't return success if we're not sharing */
BUG_ON(!(vma->vm_flags & VM_MAYSHARE));
vma->vm_region->vm_top = vma->vm_region->vm_end; vma->vm_region->vm_top = vma->vm_region->vm_end;
return 0; return 0;
} }
@ -1118,7 +1119,7 @@ unsigned long do_mmap(struct file *file,
* these cases, sharing is handled in the driver or filesystem rather * these cases, sharing is handled in the driver or filesystem rather
* than here * than here
*/ */
if (vm_flags & VM_MAYSHARE) { if (is_nommu_shared_mapping(vm_flags)) {
struct vm_region *pregion; struct vm_region *pregion;
unsigned long pglen, rpglen, pgend, rpgend, start; unsigned long pglen, rpglen, pgend, rpgend, start;
@ -1128,7 +1129,7 @@ unsigned long do_mmap(struct file *file,
for (rb = rb_first(&nommu_region_tree); rb; rb = rb_next(rb)) { for (rb = rb_first(&nommu_region_tree); rb; rb = rb_next(rb)) {
pregion = rb_entry(rb, struct vm_region, vm_rb); pregion = rb_entry(rb, struct vm_region, vm_rb);
if (!(pregion->vm_flags & VM_MAYSHARE)) if (!is_nommu_shared_mapping(pregion->vm_flags))
continue; continue;
/* search for overlapping mappings on the same file */ /* search for overlapping mappings on the same file */
@ -1575,7 +1576,7 @@ static unsigned long do_mremap(unsigned long addr,
if (vma->vm_end != vma->vm_start + old_len) if (vma->vm_end != vma->vm_start + old_len)
return (unsigned long) -EFAULT; return (unsigned long) -EFAULT;
if (vma->vm_flags & VM_MAYSHARE) if (is_nommu_shared_mapping(vma->vm_flags))
return (unsigned long) -EPERM; return (unsigned long) -EPERM;
if (new_len > vma->vm_region->vm_end - vma->vm_region->vm_start) if (new_len > vma->vm_region->vm_end - vma->vm_region->vm_start)
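
The nommu hunks above replace open-coded VM_MAYSHARE tests with is_nommu_shared_mapping(). The helper itself is introduced in a header that is not part of this excerpt; it is presumably along the lines of the sketch below, with VM_MAYOVERLAY being the companion flag added by the same series (verify against include/linux/mm.h before relying on it):

/* Presumed definition; see include/linux/mm.h in the actual tree. */
static inline bool is_nommu_shared_mapping(vm_flags_t flags)
{
	return flags & (VM_MAYSHARE | VM_MAYOVERLAY);
}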

View File

@ -114,6 +114,9 @@
#include <linux/memcontrol.h> #include <linux/memcontrol.h>
#include <linux/prefetch.h> #include <linux/prefetch.h>
#include <linux/compat.h> #include <linux/compat.h>
#include <linux/mroute.h>
#include <linux/mroute6.h>
#include <linux/icmpv6.h>
#include <linux/uaccess.h> #include <linux/uaccess.h>
@ -138,6 +141,7 @@
#include <net/tcp.h> #include <net/tcp.h>
#include <net/busy_poll.h> #include <net/busy_poll.h>
#include <net/phonet/phonet.h>
#include <linux/ethtool.h> #include <linux/ethtool.h>
@ -4028,3 +4032,63 @@ int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
return sk->sk_prot->bind_add(sk, addr, addr_len); return sk->sk_prot->bind_add(sk, addr, addr_len);
} }
EXPORT_SYMBOL(sock_bind_add); EXPORT_SYMBOL(sock_bind_add);
/* Copy 'size' bytes from userspace and return `size` back to userspace */
int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
void __user *arg, void *karg, size_t size)
{
int ret;
if (copy_from_user(karg, arg, size))
return -EFAULT;
ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
if (ret)
return ret;
if (copy_to_user(arg, karg, size))
return -EFAULT;
return 0;
}
EXPORT_SYMBOL(sock_ioctl_inout);
/* This is the most common ioctl prep function, where the result (4 bytes) is
* copied back to userspace if the ioctl() returns successfully. No input is
* copied from userspace as input argument.
*/
static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
{
int ret, karg = 0;
ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
if (ret)
return ret;
return put_user(karg, (int __user *)arg);
}
/* A wrapper around sock ioctls, which copies the data from userspace
* (depending on the protocol/ioctl), and copies back the result to userspace.
* The main motivation for this function is to pass kernel memory to the
* protocol ioctl callbacks, instead of userspace memory.
*/
int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
{
int rc = 1;
if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET)
rc = ipmr_sk_ioctl(sk, cmd, arg);
else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6)
rc = ip6mr_sk_ioctl(sk, cmd, arg);
else if (sk_is_phonet(sk))
rc = phonet_sk_ioctl(sk, cmd, arg);
/* If ioctl was processed, returns its value */
if (rc <= 0)
return rc;
/* Otherwise call the default handler */
return sock_ioctl_out(sk, cmd, arg);
}
EXPORT_SYMBOL(sk_ioctl);
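
Every per-protocol conversion that follows is the same mechanical change: the ->ioctl callback now fills a kernel-resident argument (an int, or a protocol-specific struct that sk_ioctl()/sock_ioctl_inout() prepare), and the single copy back to userspace happens in sock_ioctl_out() or sock_ioctl_inout(). A hypothetical handler under the new convention (foo_ioctl is illustrative, not part of the patch) reduces to:

#include <linux/errno.h>
#include <linux/sockios.h>
#include <net/sock.h>

/* Hypothetical protocol ioctl under the int *karg convention. */
static int foo_ioctl(struct sock *sk, int cmd, int *karg)
{
	switch (cmd) {
	case SIOCOUTQ:
		*karg = sk_wmem_alloc_get(sk);	/* send queue memory still charged */
		return 0;
	case SIOCINQ:
		/* Memory charged to the receive queue; real protocols
		 * usually report pending skb lengths instead. */
		*karg = sk_rmem_alloc_get(sk);
		return 0;
	default:
		return -ENOIOCTLCMD;		/* not handled by this protocol */
	}
}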

View File

@ -297,7 +297,7 @@ int dccp_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *optlen); char __user *optval, int __user *optlen);
int dccp_setsockopt(struct sock *sk, int level, int optname, int dccp_setsockopt(struct sock *sk, int level, int optname,
sockptr_t optval, unsigned int optlen); sockptr_t optval, unsigned int optlen);
int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg); int dccp_ioctl(struct sock *sk, int cmd, int *karg);
int dccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); int dccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
int dccp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int dccp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
int flags, int *addr_len); int flags, int *addr_len);

View File

@ -371,7 +371,7 @@ __poll_t dccp_poll(struct file *file, struct socket *sock,
EXPORT_SYMBOL_GPL(dccp_poll); EXPORT_SYMBOL_GPL(dccp_poll);
int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg) int dccp_ioctl(struct sock *sk, int cmd, int *karg)
{ {
int rc = -ENOTCONN; int rc = -ENOTCONN;
@ -382,17 +382,17 @@ int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg)
switch (cmd) { switch (cmd) {
case SIOCOUTQ: { case SIOCOUTQ: {
int amount = sk_wmem_alloc_get(sk); *karg = sk_wmem_alloc_get(sk);
/* Using sk_wmem_alloc here because sk_wmem_queued is not used by DCCP and /* Using sk_wmem_alloc here because sk_wmem_queued is not used by DCCP and
* always 0, comparably to UDP. * always 0, comparably to UDP.
*/ */
rc = put_user(amount, (int __user *)arg); rc = 0;
} }
break; break;
case SIOCINQ: { case SIOCINQ: {
struct sk_buff *skb; struct sk_buff *skb;
unsigned long amount = 0; *karg = 0;
skb = skb_peek(&sk->sk_receive_queue); skb = skb_peek(&sk->sk_receive_queue);
if (skb != NULL) { if (skb != NULL) {
@ -400,9 +400,9 @@ int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg)
* We will only return the amount of this packet since * We will only return the amount of this packet since
* that is all that will be read. * that is all that will be read.
*/ */
amount = skb->len; *karg = skb->len;
} }
rc = put_user(amount, (int __user *)arg); rc = 0;
} }
break; break;
default: default:

View File

@ -162,7 +162,7 @@ static int ieee802154_sock_ioctl(struct socket *sock, unsigned int cmd,
default: default:
if (!sk->sk_prot->ioctl) if (!sk->sk_prot->ioctl)
return -ENOIOCTLCMD; return -ENOIOCTLCMD;
return sk->sk_prot->ioctl(sk, cmd, arg); return sk_ioctl(sk, cmd, (void __user *)arg);
} }
} }
@ -524,22 +524,21 @@ out:
return err; return err;
} }
static int dgram_ioctl(struct sock *sk, int cmd, unsigned long arg) static int dgram_ioctl(struct sock *sk, int cmd, int *karg)
{ {
switch (cmd) { switch (cmd) {
case SIOCOUTQ: case SIOCOUTQ:
{ {
int amount = sk_wmem_alloc_get(sk); *karg = sk_wmem_alloc_get(sk);
return put_user(amount, (int __user *)arg); return 0;
} }
case SIOCINQ: case SIOCINQ:
{ {
struct sk_buff *skb; struct sk_buff *skb;
unsigned long amount;
amount = 0; *karg = 0;
spin_lock_bh(&sk->sk_receive_queue.lock); spin_lock_bh(&sk->sk_receive_queue.lock);
skb = skb_peek(&sk->sk_receive_queue); skb = skb_peek(&sk->sk_receive_queue);
if (skb) { if (skb) {
@ -547,10 +546,10 @@ static int dgram_ioctl(struct sock *sk, int cmd, unsigned long arg)
* of this packet since that is all * of this packet since that is all
* that will be read. * that will be read.
*/ */
amount = skb->len - ieee802154_hdr_length(skb); *karg = skb->len - ieee802154_hdr_length(skb);
} }
spin_unlock_bh(&sk->sk_receive_queue.lock); spin_unlock_bh(&sk->sk_receive_queue.lock);
return put_user(amount, (int __user *)arg); return 0;
} }
} }

View File

@ -1004,7 +1004,7 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
break; break;
default: default:
if (sk->sk_prot->ioctl) if (sk->sk_prot->ioctl)
err = sk->sk_prot->ioctl(sk, cmd, arg); err = sk_ioctl(sk, cmd, (void __user *)arg);
else else
err = -ENOIOCTLCMD; err = -ENOIOCTLCMD;
break; break;

View File

@ -1540,6 +1540,28 @@ out:
return ret; return ret;
} }
/* Execute if this ioctl is a special mroute ioctl */
int ipmr_sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
{
switch (cmd) {
/* These userspace buffers will be consumed by ipmr_ioctl() */
case SIOCGETVIFCNT: {
struct sioc_vif_req buffer;
return sock_ioctl_inout(sk, cmd, arg, &buffer,
sizeof(buffer));
}
case SIOCGETSGCNT: {
struct sioc_sg_req buffer;
return sock_ioctl_inout(sk, cmd, arg, &buffer,
sizeof(buffer));
}
}
/* return code > 0 means that the ioctl was not executed */
return 1;
}
/* Getsock opt support for the multicast routing system. */ /* Getsock opt support for the multicast routing system. */
int ip_mroute_getsockopt(struct sock *sk, int optname, sockptr_t optval, int ip_mroute_getsockopt(struct sock *sk, int optname, sockptr_t optval,
sockptr_t optlen) sockptr_t optlen)
@ -1586,13 +1608,13 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, sockptr_t optval,
} }
/* The IP multicast ioctl support routines. */ /* The IP multicast ioctl support routines. */
int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg) int ipmr_ioctl(struct sock *sk, int cmd, void *arg)
{ {
struct sioc_sg_req sr;
struct sioc_vif_req vr;
struct vif_device *vif; struct vif_device *vif;
struct mfc_cache *c; struct mfc_cache *c;
struct net *net = sock_net(sk); struct net *net = sock_net(sk);
struct sioc_vif_req *vr;
struct sioc_sg_req *sr;
struct mr_table *mrt; struct mr_table *mrt;
mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT); mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
@ -1601,40 +1623,33 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
switch (cmd) { switch (cmd) {
case SIOCGETVIFCNT: case SIOCGETVIFCNT:
if (copy_from_user(&vr, arg, sizeof(vr))) vr = (struct sioc_vif_req *)arg;
return -EFAULT; if (vr->vifi >= mrt->maxvif)
if (vr.vifi >= mrt->maxvif)
return -EINVAL; return -EINVAL;
vr.vifi = array_index_nospec(vr.vifi, mrt->maxvif); vr->vifi = array_index_nospec(vr->vifi, mrt->maxvif);
read_lock(&mrt_lock); read_lock(&mrt_lock);
vif = &mrt->vif_table[vr.vifi]; vif = &mrt->vif_table[vr->vifi];
if (VIF_EXISTS(mrt, vr.vifi)) { if (VIF_EXISTS(mrt, vr->vifi)) {
vr.icount = vif->pkt_in; vr->icount = vif->pkt_in;
vr.ocount = vif->pkt_out; vr->ocount = vif->pkt_out;
vr.ibytes = vif->bytes_in; vr->ibytes = vif->bytes_in;
vr.obytes = vif->bytes_out; vr->obytes = vif->bytes_out;
read_unlock(&mrt_lock); read_unlock(&mrt_lock);
if (copy_to_user(arg, &vr, sizeof(vr)))
return -EFAULT;
return 0; return 0;
} }
read_unlock(&mrt_lock); read_unlock(&mrt_lock);
return -EADDRNOTAVAIL; return -EADDRNOTAVAIL;
case SIOCGETSGCNT: case SIOCGETSGCNT:
if (copy_from_user(&sr, arg, sizeof(sr))) sr = (struct sioc_sg_req *)arg;
return -EFAULT;
rcu_read_lock(); rcu_read_lock();
c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr); c = ipmr_cache_find(mrt, sr->src.s_addr, sr->grp.s_addr);
if (c) { if (c) {
sr.pktcnt = c->_c.mfc_un.res.pkt; sr->pktcnt = c->_c.mfc_un.res.pkt;
sr.bytecnt = c->_c.mfc_un.res.bytes; sr->bytecnt = c->_c.mfc_un.res.bytes;
sr.wrong_if = c->_c.mfc_un.res.wrong_if; sr->wrong_if = c->_c.mfc_un.res.wrong_if;
rcu_read_unlock(); rcu_read_unlock();
if (copy_to_user(arg, &sr, sizeof(sr)))
return -EFAULT;
return 0; return 0;
} }
rcu_read_unlock(); rcu_read_unlock();

View File

@ -855,29 +855,29 @@ static int raw_getsockopt(struct sock *sk, int level, int optname,
return do_raw_getsockopt(sk, level, optname, optval, optlen); return do_raw_getsockopt(sk, level, optname, optval, optlen);
} }
static int raw_ioctl(struct sock *sk, int cmd, unsigned long arg) static int raw_ioctl(struct sock *sk, int cmd, int *karg)
{ {
switch (cmd) { switch (cmd) {
case SIOCOUTQ: { case SIOCOUTQ: {
int amount = sk_wmem_alloc_get(sk); *karg = sk_wmem_alloc_get(sk);
return 0;
return put_user(amount, (int __user *)arg);
} }
case SIOCINQ: { case SIOCINQ: {
struct sk_buff *skb; struct sk_buff *skb;
int amount = 0;
spin_lock_bh(&sk->sk_receive_queue.lock); spin_lock_bh(&sk->sk_receive_queue.lock);
skb = skb_peek(&sk->sk_receive_queue); skb = skb_peek(&sk->sk_receive_queue);
if (skb) if (skb)
amount = skb->len; *karg = skb->len;
else
*karg = 0;
spin_unlock_bh(&sk->sk_receive_queue.lock); spin_unlock_bh(&sk->sk_receive_queue.lock);
return put_user(amount, (int __user *)arg); return 0;
} }
default: default:
#ifdef CONFIG_IP_MROUTE #ifdef CONFIG_IP_MROUTE
return ipmr_ioctl(sk, cmd, (void __user *)arg); return ipmr_ioctl(sk, cmd, karg);
#else #else
return -ENOIOCTLCMD; return -ENOIOCTLCMD;
#endif #endif

View File

@ -596,7 +596,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
} }
EXPORT_SYMBOL(tcp_poll); EXPORT_SYMBOL(tcp_poll);
int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) int tcp_ioctl(struct sock *sk, int cmd, int *karg)
{ {
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
int answ; int answ;
@ -638,7 +638,8 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
return -ENOIOCTLCMD; return -ENOIOCTLCMD;
} }
return put_user(answ, (int __user *)arg); *karg = answ;
return 0;
} }
EXPORT_SYMBOL(tcp_ioctl); EXPORT_SYMBOL(tcp_ioctl);

View File

@ -1717,21 +1717,19 @@ static int first_packet_length(struct sock *sk)
* IOCTL requests applicable to the UDP protocol * IOCTL requests applicable to the UDP protocol
*/ */
int udp_ioctl(struct sock *sk, int cmd, unsigned long arg) int udp_ioctl(struct sock *sk, int cmd, int *karg)
{ {
switch (cmd) { switch (cmd) {
case SIOCOUTQ: case SIOCOUTQ:
{ {
int amount = sk_wmem_alloc_get(sk); *karg = sk_wmem_alloc_get(sk);
return 0;
return put_user(amount, (int __user *)arg);
} }
case SIOCINQ: case SIOCINQ:
{ {
int amount = max_t(int, 0, first_packet_length(sk)); *karg = max_t(int, 0, first_packet_length(sk));
return 0;
return put_user(amount, (int __user *)arg);
} }
default: default:

View File

@ -594,7 +594,7 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
prot = READ_ONCE(sk->sk_prot); prot = READ_ONCE(sk->sk_prot);
if (!prot->ioctl) if (!prot->ioctl)
return -ENOIOCTLCMD; return -ENOIOCTLCMD;
return prot->ioctl(sk, cmd, arg); return sk_ioctl(sk, cmd, (void __user *)arg);
} }
/*NOTREACHED*/ /*NOTREACHED*/
return 0; return 0;

View File

@ -1853,11 +1853,10 @@ int ip6_mroute_getsockopt(struct sock *sk, int optname, sockptr_t optval,
/* /*
* The IP multicast ioctl support routines. * The IP multicast ioctl support routines.
*/ */
int ip6mr_ioctl(struct sock *sk, int cmd, void *arg)
int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg)
{ {
struct sioc_sg_req6 sr; struct sioc_sg_req6 *sr;
struct sioc_mif_req6 vr; struct sioc_mif_req6 *vr;
struct vif_device *vif; struct vif_device *vif;
struct mfc6_cache *c; struct mfc6_cache *c;
struct net *net = sock_net(sk); struct net *net = sock_net(sk);
@ -1869,40 +1868,33 @@ int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg)
switch (cmd) { switch (cmd) {
case SIOCGETMIFCNT_IN6: case SIOCGETMIFCNT_IN6:
if (copy_from_user(&vr, arg, sizeof(vr))) vr = (struct sioc_mif_req6 *)arg;
return -EFAULT; if (vr->mifi >= mrt->maxvif)
if (vr.mifi >= mrt->maxvif)
return -EINVAL; return -EINVAL;
vr.mifi = array_index_nospec(vr.mifi, mrt->maxvif); vr->mifi = array_index_nospec(vr->mifi, mrt->maxvif);
read_lock(&mrt_lock); read_lock(&mrt_lock);
vif = &mrt->vif_table[vr.mifi]; vif = &mrt->vif_table[vr->mifi];
if (VIF_EXISTS(mrt, vr.mifi)) { if (VIF_EXISTS(mrt, vr->mifi)) {
vr.icount = vif->pkt_in; vr->icount = vif->pkt_in;
vr.ocount = vif->pkt_out; vr->ocount = vif->pkt_out;
vr.ibytes = vif->bytes_in; vr->ibytes = vif->bytes_in;
vr.obytes = vif->bytes_out; vr->obytes = vif->bytes_out;
read_unlock(&mrt_lock); read_unlock(&mrt_lock);
if (copy_to_user(arg, &vr, sizeof(vr)))
return -EFAULT;
return 0; return 0;
} }
read_unlock(&mrt_lock); read_unlock(&mrt_lock);
return -EADDRNOTAVAIL; return -EADDRNOTAVAIL;
case SIOCGETSGCNT_IN6: case SIOCGETSGCNT_IN6:
if (copy_from_user(&sr, arg, sizeof(sr))) sr = (struct sioc_sg_req6 *)arg;
return -EFAULT;
rcu_read_lock(); rcu_read_lock();
c = ip6mr_cache_find(mrt, &sr.src.sin6_addr, &sr.grp.sin6_addr); c = ip6mr_cache_find(mrt, &sr->src.sin6_addr,
&sr->grp.sin6_addr);
if (c) { if (c) {
sr.pktcnt = c->_c.mfc_un.res.pkt; sr->pktcnt = c->_c.mfc_un.res.pkt;
sr.bytecnt = c->_c.mfc_un.res.bytes; sr->bytecnt = c->_c.mfc_un.res.bytes;
sr.wrong_if = c->_c.mfc_un.res.wrong_if; sr->wrong_if = c->_c.mfc_un.res.wrong_if;
rcu_read_unlock(); rcu_read_unlock();
if (copy_to_user(arg, &sr, sizeof(sr)))
return -EFAULT;
return 0; return 0;
} }
rcu_read_unlock(); rcu_read_unlock();

View File

@ -1116,29 +1116,29 @@ static int rawv6_getsockopt(struct sock *sk, int level, int optname,
return do_rawv6_getsockopt(sk, level, optname, optval, optlen); return do_rawv6_getsockopt(sk, level, optname, optval, optlen);
} }
static int rawv6_ioctl(struct sock *sk, int cmd, unsigned long arg) static int rawv6_ioctl(struct sock *sk, int cmd, int *karg)
{ {
switch (cmd) { switch (cmd) {
case SIOCOUTQ: { case SIOCOUTQ: {
int amount = sk_wmem_alloc_get(sk); *karg = sk_wmem_alloc_get(sk);
return 0;
return put_user(amount, (int __user *)arg);
} }
case SIOCINQ: { case SIOCINQ: {
struct sk_buff *skb; struct sk_buff *skb;
int amount = 0;
spin_lock_bh(&sk->sk_receive_queue.lock); spin_lock_bh(&sk->sk_receive_queue.lock);
skb = skb_peek(&sk->sk_receive_queue); skb = skb_peek(&sk->sk_receive_queue);
if (skb) if (skb)
amount = skb->len; *karg = skb->len;
else
*karg = 0;
spin_unlock_bh(&sk->sk_receive_queue.lock); spin_unlock_bh(&sk->sk_receive_queue.lock);
return put_user(amount, (int __user *)arg); return 0;
} }
default: default:
#ifdef CONFIG_IPV6_MROUTE #ifdef CONFIG_IPV6_MROUTE
return ip6mr_ioctl(sk, cmd, (void __user *)arg); return ip6mr_ioctl(sk, cmd, karg);
#else #else
return -ENOIOCTLCMD; return -ENOIOCTLCMD;
#endif #endif

View File

@ -272,7 +272,7 @@ int l2tp_nl_register_ops(enum l2tp_pwtype pw_type, const struct l2tp_nl_cmd_ops
void l2tp_nl_unregister_ops(enum l2tp_pwtype pw_type); void l2tp_nl_unregister_ops(enum l2tp_pwtype pw_type);
/* IOCTL helper for IP encap modules. */ /* IOCTL helper for IP encap modules. */
int l2tp_ioctl(struct sock *sk, int cmd, unsigned long arg); int l2tp_ioctl(struct sock *sk, int cmd, int *karg);
/* Extract the tunnel structure from a socket's sk_user_data pointer, /* Extract the tunnel structure from a socket's sk_user_data pointer,
* validating the tunnel magic feather. * validating the tunnel magic feather.

View File

@ -563,19 +563,18 @@ out:
return err ? err : copied; return err ? err : copied;
} }
int l2tp_ioctl(struct sock *sk, int cmd, unsigned long arg) int l2tp_ioctl(struct sock *sk, int cmd, int *karg)
{ {
struct sk_buff *skb; struct sk_buff *skb;
int amount;
switch (cmd) { switch (cmd) {
case SIOCOUTQ: case SIOCOUTQ:
amount = sk_wmem_alloc_get(sk); *karg = sk_wmem_alloc_get(sk);
break; break;
case SIOCINQ: case SIOCINQ:
spin_lock_bh(&sk->sk_receive_queue.lock); spin_lock_bh(&sk->sk_receive_queue.lock);
skb = skb_peek(&sk->sk_receive_queue); skb = skb_peek(&sk->sk_receive_queue);
amount = skb ? skb->len : 0; *karg = skb ? skb->len : 0;
spin_unlock_bh(&sk->sk_receive_queue.lock); spin_unlock_bh(&sk->sk_receive_queue.lock);
break; break;
@ -583,7 +582,7 @@ int l2tp_ioctl(struct sock *sk, int cmd, unsigned long arg)
return -ENOIOCTLCMD; return -ENOIOCTLCMD;
} }
return put_user(amount, (int __user *)arg); return 0;
} }
EXPORT_SYMBOL_GPL(l2tp_ioctl); EXPORT_SYMBOL_GPL(l2tp_ioctl);

View File

@ -3620,11 +3620,10 @@ static int mptcp_ioctl_outq(const struct mptcp_sock *msk, u64 v)
return (int)delta; return (int)delta;
} }
static int mptcp_ioctl(struct sock *sk, int cmd, unsigned long arg) static int mptcp_ioctl(struct sock *sk, int cmd, int *karg)
{ {
struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_sock *msk = mptcp_sk(sk);
bool slow; bool slow;
int answ;
switch (cmd) { switch (cmd) {
case SIOCINQ: case SIOCINQ:
@ -3633,24 +3632,24 @@ static int mptcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
lock_sock(sk); lock_sock(sk);
__mptcp_move_skbs(msk); __mptcp_move_skbs(msk);
answ = mptcp_inq_hint(sk); *karg = mptcp_inq_hint(sk);
release_sock(sk); release_sock(sk);
break; break;
case SIOCOUTQ: case SIOCOUTQ:
slow = lock_sock_fast(sk); slow = lock_sock_fast(sk);
answ = mptcp_ioctl_outq(msk, READ_ONCE(msk->snd_una)); *karg = mptcp_ioctl_outq(msk, READ_ONCE(msk->snd_una));
unlock_sock_fast(sk, slow); unlock_sock_fast(sk, slow);
break; break;
case SIOCOUTQNSD: case SIOCOUTQNSD:
slow = lock_sock_fast(sk); slow = lock_sock_fast(sk);
answ = mptcp_ioctl_outq(msk, msk->snd_nxt); *karg = mptcp_ioctl_outq(msk, msk->snd_nxt);
unlock_sock_fast(sk, slow); unlock_sock_fast(sk, slow);
break; break;
default: default:
return -ENOIOCTLCMD; return -ENOIOCTLCMD;
} }
return put_user(answ, (int __user *)arg); return 0;
} }
static void mptcp_subflow_early_fallback(struct mptcp_sock *msk, static void mptcp_subflow_early_fallback(struct mptcp_sock *msk,

View File

@ -28,24 +28,21 @@ static void pn_sock_close(struct sock *sk, long timeout)
sk_common_release(sk); sk_common_release(sk);
} }
static int pn_ioctl(struct sock *sk, int cmd, unsigned long arg) static int pn_ioctl(struct sock *sk, int cmd, int *karg)
{ {
struct sk_buff *skb; struct sk_buff *skb;
int answ;
switch (cmd) { switch (cmd) {
case SIOCINQ: case SIOCINQ:
lock_sock(sk); lock_sock(sk);
skb = skb_peek(&sk->sk_receive_queue); skb = skb_peek(&sk->sk_receive_queue);
answ = skb ? skb->len : 0; *karg = skb ? skb->len : 0;
release_sock(sk); release_sock(sk);
return put_user(answ, (int __user *)arg); return 0;
case SIOCPNADDRESOURCE: case SIOCPNADDRESOURCE:
case SIOCPNDELRESOURCE: { case SIOCPNDELRESOURCE: {
u32 res; u32 res = *karg;
if (get_user(res, (u32 __user *)arg))
return -EFAULT;
if (res >= 256) if (res >= 256)
return -EINVAL; return -EINVAL;
if (cmd == SIOCPNADDRESOURCE) if (cmd == SIOCPNADDRESOURCE)

View File

@ -916,10 +916,9 @@ static int pep_sock_enable(struct sock *sk, struct sockaddr *addr, int len)
return 0; return 0;
} }
static int pep_ioctl(struct sock *sk, int cmd, unsigned long arg) static int pep_ioctl(struct sock *sk, int cmd, int *karg)
{ {
struct pep_sock *pn = pep_sk(sk); struct pep_sock *pn = pep_sk(sk);
int answ;
int ret = -ENOIOCTLCMD; int ret = -ENOIOCTLCMD;
switch (cmd) { switch (cmd) {
@ -932,13 +931,13 @@ static int pep_ioctl(struct sock *sk, int cmd, unsigned long arg)
lock_sock(sk); lock_sock(sk);
if (sock_flag(sk, SOCK_URGINLINE) && if (sock_flag(sk, SOCK_URGINLINE) &&
!skb_queue_empty(&pn->ctrlreq_queue)) !skb_queue_empty(&pn->ctrlreq_queue))
answ = skb_peek(&pn->ctrlreq_queue)->len; *karg = skb_peek(&pn->ctrlreq_queue)->len;
else if (!skb_queue_empty(&sk->sk_receive_queue)) else if (!skb_queue_empty(&sk->sk_receive_queue))
answ = skb_peek(&sk->sk_receive_queue)->len; *karg = skb_peek(&sk->sk_receive_queue)->len;
else else
answ = 0; *karg = 0;
release_sock(sk); release_sock(sk);
ret = put_user(answ, (int __user *)arg); ret = 0;
break; break;
case SIOCPNENABLEPIPE: case SIOCPNENABLEPIPE:

View File

@ -387,7 +387,7 @@ static int pn_socket_ioctl(struct socket *sock, unsigned int cmd,
return put_user(handle, (__u16 __user *)arg); return put_user(handle, (__u16 __user *)arg);
} }
return sk->sk_prot->ioctl(sk, cmd, arg); return sk_ioctl(sk, cmd, (void __user *)arg);
} }
static int pn_socket_listen(struct socket *sock, int backlog) static int pn_socket_listen(struct socket *sock, int backlog)

Some files were not shown because too many files have changed in this diff.