Merge: io_uring: update to upstream v6.6

MR: https://gitlab.com/redhat/centos-stream/src/kernel/centos-stream-9/-/merge_requests/3318

Update io_uring and its dependencies to upstream kernel version 6.6.

JIRA: https://issues.redhat.com/browse/RHEL-12076
JIRA: https://issues.redhat.com/browse/RHEL-14998
JIRA: https://issues.redhat.com/browse/RHEL-4447
CVE: CVE-2023-46862

Omitted-Fix: ab69838e7c75 ("io_uring/kbuf: Fix check of BID wrapping in provided buffers")
Omitted-Fix: f74c746e476b ("io_uring/kbuf: Allow the full buffer id space for provided buffers")

This is the list of new features available (includes upstream kernel versions 6.3-6.6):

    User-specified ring buffer
    Provided Buffers allocated by the kernel
    Ability to register the ring fd
    Multi-shot timeouts
    Ability to pass custom flags to the completion queue entry for ring messages

All of these features are covered by the liburing tests.

In my testing, no-mmap-inval.t failed because the test itself is broken. socket-uring-cmd.t also failed, because of a missing SELinux policy rule; if you see a failure in that test, try running audit2allow to generate the missing rule.

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>

Approved-by: Wander Lairson Costa <wander@redhat.com>
Approved-by: Donald Dutile <ddutile@redhat.com>
Approved-by: Chris von Recklinghausen <crecklin@redhat.com>
Approved-by: Jiri Benc <jbenc@redhat.com>
Approved-by: Ming Lei <ming.lei@redhat.com>

Signed-off-by: Scott Weaver <scweaver@redhat.com>
This commit is contained in:
Scott Weaver 2023-12-16 14:38:47 -05:00
commit 8d95883db0
112 changed files with 3236 additions and 3674 deletions

View File

@ -448,17 +448,26 @@ io_uring_disabled
Prevents all processes from creating new io_uring instances. Enabling this
shrinks the kernel's attack surface.
= ==================================================================
0 All processes can create io_uring instances as normal. This is the
default setting.
1 io_uring creation is disabled for unprivileged processes.
io_uring_setup fails with -EPERM unless the calling process is
privileged (CAP_SYS_ADMIN). Existing io_uring instances can
still be used.
2 io_uring creation is disabled for all processes. io_uring_setup
= ======================================================================
0 All processes can create io_uring instances as normal.
1 io_uring creation is disabled (io_uring_setup() will fail with
-EPERM) for unprivileged processes not in the io_uring_group group.
Existing io_uring instances can still be used. See the
documentation for io_uring_group for more information.
2 io_uring creation is disabled for all processes. io_uring_setup()
always fails with -EPERM. Existing io_uring instances can still be
used.
= ==================================================================
used. This is the default setting.
= ======================================================================
io_uring_group
==============
When io_uring_disabled is set to 1, a process must either be
privileged (CAP_SYS_ADMIN) or be in the io_uring_group group in order
to create an io_uring instance. If io_uring_group is set to -1 (the
default), only processes with the CAP_SYS_ADMIN capability may create
io_uring instances.
kexec_load_disabled

View File

@ -10086,7 +10086,6 @@ F: io_uring/
F: include/linux/io_uring.h
F: include/linux/io_uring_types.h
F: include/uapi/linux/io_uring.h
F: tools/io_uring/
IPMI SUBSYSTEM
M: Corey Minyard <minyard@acm.org>

View File

@ -29,9 +29,11 @@ static struct bio_map_data *bio_alloc_map_data(struct iov_iter *data,
bmd = kmalloc(struct_size(bmd, iov, data->nr_segs), gfp_mask);
if (!bmd)
return NULL;
memcpy(bmd->iov, data->iov, sizeof(struct iovec) * data->nr_segs);
bmd->iter = *data;
bmd->iter.iov = bmd->iov;
if (iter_is_iovec(data)) {
memcpy(bmd->iov, iter_iov(data), sizeof(struct iovec) * data->nr_segs);
bmd->iter.__iov = bmd->iov;
}
return bmd;
}
@ -636,7 +638,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
copy = true;
else if (iov_iter_is_bvec(iter))
map_bvec = true;
else if (!iter_is_iovec(iter))
else if (!user_backed_iter(iter))
copy = true;
else if (queue_virt_boundary(q))
copy = queue_virt_boundary(q) & iov_iter_gap_alignment(iter);
@ -677,9 +679,8 @@ int blk_rq_map_user(struct request_queue *q, struct request *rq,
struct rq_map_data *map_data, void __user *ubuf,
unsigned long len, gfp_t gfp_mask)
{
struct iovec iov;
struct iov_iter i;
int ret = import_single_range(rq_data_dir(rq), ubuf, len, &iov, &i);
int ret = import_ubuf(rq_data_dir(rq), ubuf, len, &i);
if (unlikely(ret < 0))
return ret;

View File

@ -506,7 +506,7 @@ static int blkdev_open(struct inode *inode, struct file *filp)
* during an unstable branch.
*/
filp->f_flags |= O_LARGEFILE;
filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
filp->f_mode |= FMODE_BUF_RASYNC;
/*
* Use the file private data to store the holder for exclusive openes.
@ -520,6 +520,9 @@ static int blkdev_open(struct inode *inode, struct file *filp)
if (IS_ERR(bdev))
return PTR_ERR(bdev);
if (bdev_nowait(bdev))
filp->f_mode |= FMODE_NOWAIT;
filp->f_mapping = bdev->bd_inode->i_mapping;
filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping);
return 0;

View File

@ -362,7 +362,7 @@ static unsigned zero_mmap_capabilities(struct file *file)
/* can't do an in-place private mapping if there's no MMU */
static inline int private_mapping_ok(struct vm_area_struct *vma)
{
return vma->vm_flags & VM_MAYSHARE;
return is_nommu_shared_mapping(vma->vm_flags);
}
#else

View File

@ -284,11 +284,12 @@ static ssize_t hfi1_write_iter(struct kiocb *kiocb, struct iov_iter *from)
}
while (dim) {
const struct iovec *iov = iter_iov(from);
int ret;
unsigned long count = 0;
ret = hfi1_user_sdma_process_request(
fd, (struct iovec *)(from->iov + done),
fd, (struct iovec *)(iov + done),
dim, &count);
if (ret) {
reqs = ret;

View File

@ -2246,10 +2246,10 @@ static ssize_t qib_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct qib_ctxtdata *rcd = ctxt_fp(iocb->ki_filp);
struct qib_user_sdma_queue *pq = fp->pq;
if (!iter_is_iovec(from) || !from->nr_segs || !pq)
if (!from->user_backed || !from->nr_segs || !pq)
return -EINVAL;
return qib_user_sdma_writev(rcd, pq, from->iov, from->nr_segs);
return qib_user_sdma_writev(rcd, pq, iter_iov(from), from->nr_segs);
}
static struct class *qib_class;

View File

@ -1473,7 +1473,8 @@ static struct sk_buff *tun_napi_alloc_frags(struct tun_file *tfile,
skb->truesize += skb->data_len;
for (i = 1; i < it->nr_segs; i++) {
size_t fragsz = it->iov[i].iov_len;
const struct iovec *iov = iter_iov(it);
size_t fragsz = iov->iov_len;
struct page *page;
void *frag;

View File

@ -551,7 +551,7 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
struct io_uring_cmd *ioucmd, unsigned int issue_flags, bool vec)
{
struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
const struct nvme_uring_cmd *cmd = ioucmd->cmd;
const struct nvme_uring_cmd *cmd = io_uring_sqe_cmd(ioucmd->sqe);
struct request_queue *q = ns ? ns->queue : ctrl->admin_q;
struct nvme_uring_data d;
struct nvme_command c;

View File

@ -1246,7 +1246,7 @@ static ssize_t ffs_epfile_read_iter(struct kiocb *kiocb, struct iov_iter *to)
p->kiocb = kiocb;
if (p->aio) {
p->to_free = dup_iter(&p->data, to, GFP_KERNEL);
if (!p->to_free) {
if (!iter_is_ubuf(&p->data) && !p->to_free) {
kfree(p);
return -ENOMEM;
}

View File

@ -613,7 +613,7 @@ ep_read_iter(struct kiocb *iocb, struct iov_iter *to)
if (!priv)
goto fail;
priv->to_free = dup_iter(&priv->to, to, GFP_KERNEL);
if (!priv->to_free) {
if (!iter_is_ubuf(&priv->to) && !priv->to_free) {
kfree(priv);
goto fail;
}

View File

@ -641,7 +641,7 @@ vhost_scsi_calc_sgls(struct iov_iter *iter, size_t bytes, int max_sgls)
{
int sgl_count = 0;
if (!iter || !iter->iov) {
if (!iter || !iter_iov(iter)) {
pr_err("%s: iter->iov is NULL, but expected bytes: %zu"
" present\n", __func__, bytes);
return -EINVAL;

View File

@ -3621,10 +3621,15 @@ static int check_direct_read(struct btrfs_fs_info *fs_info,
if (!iter_is_iovec(iter))
return 0;
for (seg = 0; seg < iter->nr_segs; seg++)
for (i = seg + 1; i < iter->nr_segs; i++)
if (iter->iov[seg].iov_base == iter->iov[i].iov_base)
for (seg = 0; seg < iter->nr_segs; seg++) {
for (i = seg + 1; i < iter->nr_segs; i++) {
const struct iovec *iov1 = iter_iov(iter) + seg;
const struct iovec *iov2 = iter_iov(iter) + i;
if (iov1->iov_base == iov2->iov_base)
return -EINVAL;
}
}
return 0;
}

View File

@ -446,7 +446,7 @@ bailout:
static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma)
{
return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -ENOSYS;
return is_nommu_shared_mapping(vma->vm_flags) ? 0 : -ENOSYS;
}
static unsigned long cramfs_physmem_get_unmapped_area(struct file *file,

View File

@ -153,8 +153,6 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
path_noexec(&file->f_path)))
goto exit;
fsnotify_open(file);
error = -ENOEXEC;
read_lock(&binfmt_lock);
@ -939,9 +937,6 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags)
if (err)
goto exit;
if (name->name[0] != '\0')
fsnotify_open(file);
out:
return file;

View File

@ -902,7 +902,8 @@ static int ext4_file_open(struct inode *inode, struct file *filp)
return ret;
}
filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC |
FMODE_DIO_PARALLEL_WRITE;
return dquot_file_open(inode, filp);
}

View File

@ -236,7 +236,6 @@ static long do_handle_open(int mountdirfd, struct file_handle __user *ufh,
retval = PTR_ERR(file);
} else {
retval = fd;
fsnotify_open(file);
fd_install(fd, file);
}
path_put(&path);

View File

@ -1370,7 +1370,7 @@ out:
static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii)
{
return (unsigned long)ii->iov->iov_base + ii->iov_offset;
return (unsigned long)iter_iov(ii)->iov_base + ii->iov_offset;
}
static inline size_t fuse_get_frag_size(const struct iov_iter *ii,

View File

@ -19,10 +19,12 @@
* Private flags for iomap_dio, must not overlap with the public ones in
* iomap.h:
*/
#define IOMAP_DIO_WRITE_FUA (1 << 28)
#define IOMAP_DIO_NEED_SYNC (1 << 29)
#define IOMAP_DIO_WRITE (1 << 30)
#define IOMAP_DIO_DIRTY (1 << 31)
#define IOMAP_DIO_CALLER_COMP (1U << 26)
#define IOMAP_DIO_INLINE_COMP (1U << 27)
#define IOMAP_DIO_WRITE_THROUGH (1U << 28)
#define IOMAP_DIO_NEED_SYNC (1U << 29)
#define IOMAP_DIO_WRITE (1U << 30)
#define IOMAP_DIO_DIRTY (1U << 31)
struct iomap_dio {
struct kiocb *iocb;
@ -40,7 +42,6 @@ struct iomap_dio {
struct {
struct iov_iter *iter;
struct task_struct *waiter;
struct bio *poll_bio;
} submit;
/* used for aio completion: */
@ -53,12 +54,14 @@ struct iomap_dio {
static void iomap_dio_submit_bio(const struct iomap_iter *iter,
struct iomap_dio *dio, struct bio *bio, loff_t pos)
{
struct kiocb *iocb = dio->iocb;
atomic_inc(&dio->ref);
/* Sync dio can't be polled reliably */
if ((dio->iocb->ki_flags & IOCB_HIPRI) && !is_sync_kiocb(dio->iocb)) {
bio_set_polled(bio, dio->iocb);
dio->submit.poll_bio = bio;
if ((iocb->ki_flags & IOCB_HIPRI) && !is_sync_kiocb(iocb)) {
bio_set_polled(bio, iocb);
WRITE_ONCE(iocb->private, bio);
}
if (dio->dops && dio->dops->submit_io)
@ -126,6 +129,11 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio)
}
EXPORT_SYMBOL_GPL(iomap_dio_complete);
static ssize_t iomap_dio_deferred_complete(void *data)
{
return iomap_dio_complete(data);
}
static void iomap_dio_complete_work(struct work_struct *work)
{
struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
@ -148,27 +156,69 @@ static void iomap_dio_bio_end_io(struct bio *bio)
{
struct iomap_dio *dio = bio->bi_private;
bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
struct kiocb *iocb = dio->iocb;
if (bio->bi_status)
iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
if (!atomic_dec_and_test(&dio->ref))
goto release_bio;
if (atomic_dec_and_test(&dio->ref)) {
if (dio->wait_for_completion) {
struct task_struct *waiter = dio->submit.waiter;
WRITE_ONCE(dio->submit.waiter, NULL);
blk_wake_io_task(waiter);
} else if (dio->flags & IOMAP_DIO_WRITE) {
struct inode *inode = file_inode(dio->iocb->ki_filp);
/*
* Synchronous dio, task itself will handle any completion work
* that needs after IO. All we need to do is wake the task.
*/
if (dio->wait_for_completion) {
struct task_struct *waiter = dio->submit.waiter;
WRITE_ONCE(dio->iocb->private, NULL);
INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
} else {
WRITE_ONCE(dio->iocb->private, NULL);
iomap_dio_complete_work(&dio->aio.work);
}
WRITE_ONCE(dio->submit.waiter, NULL);
blk_wake_io_task(waiter);
goto release_bio;
}
/*
* Flagged with IOMAP_DIO_INLINE_COMP, we can complete it inline
*/
if (dio->flags & IOMAP_DIO_INLINE_COMP) {
WRITE_ONCE(iocb->private, NULL);
iomap_dio_complete_work(&dio->aio.work);
goto release_bio;
}
/*
* If this dio is flagged with IOMAP_DIO_CALLER_COMP, then schedule
* our completion that way to avoid an async punt to a workqueue.
*/
if (dio->flags & IOMAP_DIO_CALLER_COMP) {
/* only polled IO cares about private cleared */
iocb->private = dio;
iocb->dio_complete = iomap_dio_deferred_complete;
/*
* Invoke ->ki_complete() directly. We've assigned our
* dio_complete callback handler, and since the issuer set
* IOCB_DIO_CALLER_COMP, we know their ki_complete handler will
* notice ->dio_complete being set and will defer calling that
* handler until it can be done from a safe task context.
*
* Note that the 'res' being passed in here is not important
* for this case. The actual completion value of the request
* will be gotten from dio_complete when that is run by the
* issuer.
*/
iocb->ki_complete(iocb, 0);
goto release_bio;
}
/*
* Async DIO completion that requires filesystem level completion work
* gets punted to a work queue to complete as the operation may require
* more IO to be issued to finalise filesystem metadata changes or
* guarantee data integrity.
*/
INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
queue_work(file_inode(iocb->ki_filp)->i_sb->s_dio_done_wq,
&dio->aio.work);
release_bio:
if (should_dirty) {
bio_check_pages_dirty(bio);
} else {
@ -197,7 +247,7 @@ static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
/*
* Figure out the bio's operation flags from the dio request, the
* mapping, and whether or not we want FUA. Note that we can end up
* clearing the WRITE_FUA flag in the dio request.
* clearing the WRITE_THROUGH flag in the dio request.
*/
static inline unsigned int iomap_dio_bio_opflags(struct iomap_dio *dio,
const struct iomap *iomap, bool use_fua)
@ -217,7 +267,7 @@ static inline unsigned int iomap_dio_bio_opflags(struct iomap_dio *dio,
if (use_fua)
opflags |= REQ_FUA;
else
dio->flags &= ~IOMAP_DIO_WRITE_FUA;
dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
return opflags;
}
@ -258,12 +308,19 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
* Use a FUA write if we need datasync semantics, this is a pure
* data IO that doesn't require any metadata updates (including
* after IO completion such as unwritten extent conversion) and
* the underlying device supports FUA. This allows us to avoid
* cache flushes on IO completion.
* the underlying device either supports FUA or doesn't have
* a volatile write cache. This allows us to avoid cache flushes
* on IO completion. If we can't use writethrough and need to
* sync, disable in-task completions as dio completion will
* need to call generic_write_sync() which will do a blocking
* fsync / cache flush call.
*/
if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
(dio->flags & IOMAP_DIO_WRITE_FUA) && bdev_fua(iomap->bdev))
(dio->flags & IOMAP_DIO_WRITE_THROUGH) &&
(bdev_fua(iomap->bdev) || !bdev_write_cache(iomap->bdev)))
use_fua = true;
else if (dio->flags & IOMAP_DIO_NEED_SYNC)
dio->flags &= ~IOMAP_DIO_CALLER_COMP;
}
/*
@ -278,10 +335,23 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
goto out;
/*
* We can only poll for single bio I/Os.
* We can only do deferred completion for pure overwrites that
* don't require additional IO at completion. This rules out
* writes that need zeroing or extent conversion, extend
* the file size, or issue journal IO or cache flushes
* during completion processing.
*/
if (need_zeroout ||
((dio->flags & IOMAP_DIO_NEED_SYNC) && !use_fua) ||
((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode)))
dio->flags &= ~IOMAP_DIO_CALLER_COMP;
/*
* The rules for polled IO completions follow the guidelines as the
* ones we set for inline and deferred completions. If none of those
* are available for this IO, clear the polled flag.
*/
if (!(dio->flags & (IOMAP_DIO_INLINE_COMP|IOMAP_DIO_CALLER_COMP)))
dio->iocb->ki_flags &= ~IOCB_HIPRI;
if (need_zeroout) {
@ -502,9 +572,11 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
dio->submit.iter = iter;
dio->submit.waiter = current;
dio->submit.poll_bio = NULL;
if (iov_iter_rw(iter) == READ) {
/* reads can always complete inline */
dio->flags |= IOMAP_DIO_INLINE_COMP;
if (iomi.pos >= dio->i_size)
goto out_free_dio;
@ -523,6 +595,15 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
iomi.flags |= IOMAP_WRITE;
dio->flags |= IOMAP_DIO_WRITE;
/*
* Flag as supporting deferred completions, if the issuer
* groks it. This can avoid a workqueue punt for writes.
* We may later clear this flag if we need to do other IO
* as part of this IO completion.
*/
if (iocb->ki_flags & IOCB_DIO_CALLER_COMP)
dio->flags |= IOMAP_DIO_CALLER_COMP;
if (iocb->ki_flags & IOCB_NOWAIT) {
if (filemap_range_has_page(mapping, iomi.pos, end)) {
ret = -EAGAIN;
@ -536,13 +617,16 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
dio->flags |= IOMAP_DIO_NEED_SYNC;
/*
* For datasync only writes, we optimistically try using FUA for
* this IO. Any non-FUA write that occurs will clear this flag,
* hence we know before completion whether a cache flush is
* necessary.
* For datasync only writes, we optimistically try using
* WRITE_THROUGH for this IO. This flag requires either
* FUA writes through the device's write cache, or a
* normal write to a device without a volatile write
* cache. For the former, Any non-FUA write that occurs
* will clear this flag, hence we know before completion
* whether a cache flush is necessary.
*/
if ((iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) == IOCB_DSYNC)
dio->flags |= IOMAP_DIO_WRITE_FUA;
dio->flags |= IOMAP_DIO_WRITE_THROUGH;
}
if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) {
@ -615,14 +699,13 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
iomap_dio_set_error(dio, ret);
/*
* If all the writes we issued were FUA, we don't need to flush the
* cache on IO completion. Clear the sync flag for this case.
* If all the writes we issued were already written through to the
* media, we don't need to flush the cache on IO completion. Clear the
* sync flag for this case.
*/
if (dio->flags & IOMAP_DIO_WRITE_FUA)
if (dio->flags & IOMAP_DIO_WRITE_THROUGH)
dio->flags &= ~IOMAP_DIO_NEED_SYNC;
WRITE_ONCE(iocb->private, dio->submit.poll_bio);
/*
* We are about to drop our additional submission reference, which
* might be the last reference to the dio. There are three different

View File

@ -886,6 +886,11 @@ static int do_dentry_open(struct file *f,
truncate_pagecache(inode, 0);
}
/*
* Once we return a file with FMODE_OPENED, __fput() will call
* fsnotify_close(), so we need fsnotify_open() here for symmetry.
*/
fsnotify_open(f);
return 0;
cleanup_all:
@ -1270,7 +1275,6 @@ static long do_sys_openat2(int dfd, const char __user *filename,
put_unused_fd(fd);
fd = PTR_ERR(f);
} else {
fsnotify_open(f);
fd_install(fd, f);
}
}

View File

@ -390,6 +390,12 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
if (!ovl_should_sync(OVL_FS(inode->i_sb)))
ifl &= ~(IOCB_DSYNC | IOCB_SYNC);
/*
* Overlayfs doesn't support deferred completions, don't copy
* this property in case it is set by the issuer.
*/
ifl &= ~IOCB_DIO_CALLER_COMP;
old_cred = ovl_override_creds(file_inode(file)->i_sb);
if (is_sync_kiocb(iocb)) {
file_start_write(real.file);

View File

@ -40,7 +40,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
}
if (atomic_read(&mm->mm_count) > 1 ||
vma->vm_flags & VM_MAYSHARE) {
is_nommu_shared_mapping(vma->vm_flags)) {
sbytes += size;
} else {
bytes += size;

View File

@ -264,7 +264,7 @@ out:
*/
static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma)
{
if (!(vma->vm_flags & (VM_SHARED | VM_MAYSHARE)))
if (!is_nommu_shared_mapping(vma->vm_flags))
return -ENOSYS;
file_accessed(file);

View File

@ -749,15 +749,14 @@ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
return -EOPNOTSUPP;
while (iov_iter_count(iter)) {
struct iovec iovec = iov_iter_iovec(iter);
ssize_t nr;
if (type == READ) {
nr = filp->f_op->read(filp, iovec.iov_base,
iovec.iov_len, ppos);
nr = filp->f_op->read(filp, iter_iov_addr(iter),
iter_iov_len(iter), ppos);
} else {
nr = filp->f_op->write(filp, iovec.iov_base,
iovec.iov_len, ppos);
nr = filp->f_op->write(filp, iter_iov_addr(iter),
iter_iov_len(iter), ppos);
}
if (nr < 0) {
@ -766,7 +765,7 @@ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
break;
}
ret += nr;
if (nr != iovec.iov_len)
if (nr != iter_iov_len(iter))
break;
iov_iter_advance(iter, nr);
}

View File

@ -63,7 +63,7 @@ static unsigned long romfs_get_unmapped_area(struct file *file,
*/
static int romfs_mmap(struct file *file, struct vm_area_struct *vma)
{
return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -ENOSYS;
return is_nommu_shared_mapping(vma->vm_flags) ? 0 : -ENOSYS;
}
static unsigned romfs_mmap_capabilities(struct file *file)

View File

@ -1171,7 +1171,8 @@ xfs_file_open(
{
if (xfs_is_shutdown(XFS_M(inode->i_sb)))
return -EIO;
file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC;
file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC |
FMODE_DIO_PARALLEL_WRITE;
return generic_file_open(inode, file);
}

View File

@ -159,6 +159,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
/* File supports DIRECT IO */
#define FMODE_CAN_ODIRECT ((__force fmode_t)0x400000)
/* File supports non-exclusive O_DIRECT writes from multiple threads */
#define FMODE_DIO_PARALLEL_WRITE ((__force fmode_t)0x1000000)
/* File was opened by fanotify and shouldn't generate fanotify events */
#define FMODE_NONOTIFY ((__force fmode_t)0x4000000)
@ -312,20 +315,60 @@ enum rw_hint {
#define IOCB_NOIO (1 << 20)
/* can use bio alloc cache */
#define IOCB_ALLOC_CACHE (1 << 21)
/*
* IOCB_DIO_CALLER_COMP can be set by the iocb owner, to indicate that the
* iocb completion can be passed back to the owner for execution from a safe
* context rather than needing to be punted through a workqueue. If this
* flag is set, the bio completion handling may set iocb->dio_complete to a
* handler function and iocb->private to context information for that handler.
* The issuer should call the handler with that context information from task
* context to complete the processing of the iocb. Note that while this
* provides a task context for the dio_complete() callback, it should only be
* used on the completion side for non-IO generating completions. It's fine to
* call blocking functions from this callback, but they should not wait for
* unrelated IO (like cache flushing, new IO generation, etc).
*/
#define IOCB_DIO_CALLER_COMP (1 << 22)
/* for use in trace events */
#define TRACE_IOCB_STRINGS \
{ IOCB_HIPRI, "HIPRI" }, \
{ IOCB_DSYNC, "DSYNC" }, \
{ IOCB_SYNC, "SYNC" }, \
{ IOCB_NOWAIT, "NOWAIT" }, \
{ IOCB_APPEND, "APPEND" }, \
{ IOCB_EVENTFD, "EVENTFD"}, \
{ IOCB_DIRECT, "DIRECT" }, \
{ IOCB_WRITE, "WRITE" }, \
{ IOCB_WAITQ, "WAITQ" }, \
{ IOCB_NOIO, "NOIO" }, \
{ IOCB_ALLOC_CACHE, "ALLOC_CACHE" }, \
{ IOCB_DIO_CALLER_COMP, "CALLER_COMP" }
struct kiocb {
struct file *ki_filp;
/* The 'ki_filp' pointer is shared in a union for aio */
randomized_struct_fields_start
loff_t ki_pos;
void (*ki_complete)(struct kiocb *iocb, long ret);
void *private;
int ki_flags;
u16 ki_ioprio; /* See linux/ioprio.h */
struct wait_page_queue *ki_waitq; /* for async buffered IO */
randomized_struct_fields_end
union {
/*
* Only used for async buffered reads, where it denotes the
* page waitqueue associated with completing the read. Valid
* IFF IOCB_WAITQ is set.
*/
struct wait_page_queue *ki_waitq;
/*
* Can be used for O_DIRECT IO, where the completion handling
* is punted back to the issuer of the IO. May only be set
* if IOCB_DIO_CALLER_COMP is set by the issuer, and the issuer
* must then check for presence of this handler when ki_complete
* is invoked. The data passed in to this handler must be
* assigned to ->private when dio_complete is assigned.
*/
ssize_t (*dio_complete)(void *data);
};
};
static inline bool is_sync_kiocb(struct kiocb *kiocb)

View File

@ -24,7 +24,7 @@ enum io_uring_cmd_flags {
struct io_uring_cmd {
struct file *file;
const void *cmd;
const struct io_uring_sqe *sqe;
union {
/* callback to defer completions to task context */
void (*task_work_cb)(struct io_uring_cmd *cmd, unsigned);
@ -36,18 +36,33 @@ struct io_uring_cmd {
u8 pdu[32]; /* available inline for free use */
};
static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe)
{
return sqe->cmd;
}
#if defined(CONFIG_IO_URING)
int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
struct iov_iter *iter, void *ioucmd);
void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2,
unsigned issue_flags);
void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
void (*task_work_cb)(struct io_uring_cmd *, unsigned));
struct sock *io_uring_get_socket(struct file *file);
void __io_uring_cancel(bool cancel_all);
void __io_uring_free(struct task_struct *tsk);
void io_uring_unreg_ringfd(void);
const char *io_uring_get_opcode(u8 opcode);
void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
void (*task_work_cb)(struct io_uring_cmd *, unsigned),
unsigned flags);
/* users should follow semantics of IOU_F_TWQ_LAZY_WAKE */
void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd,
void (*task_work_cb)(struct io_uring_cmd *, unsigned));
static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
void (*task_work_cb)(struct io_uring_cmd *, unsigned))
{
__io_uring_cmd_do_in_task(ioucmd, task_work_cb, 0);
}
static inline void io_uring_files_cancel(void)
{
@ -66,6 +81,7 @@ static inline void io_uring_free(struct task_struct *tsk)
if (tsk->io_uring)
__io_uring_free(tsk);
}
int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags);
#else
static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
struct iov_iter *iter, void *ioucmd)
@ -80,6 +96,10 @@ static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
void (*task_work_cb)(struct io_uring_cmd *, unsigned))
{
}
static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd,
void (*task_work_cb)(struct io_uring_cmd *, unsigned))
{
}
static inline struct sock *io_uring_get_socket(struct file *file)
{
return NULL;
@ -97,6 +117,11 @@ static inline const char *io_uring_get_opcode(u8 opcode)
{
return "";
}
static inline int io_uring_cmd_sock(struct io_uring_cmd *cmd,
unsigned int issue_flags)
{
return -EOPNOTSUPP;
}
#endif
#endif

View File

@ -58,7 +58,7 @@ struct io_uring_task {
struct xarray xa;
struct wait_queue_head wait;
atomic_t in_idle;
atomic_t in_cancel;
atomic_t inflight_tracked;
struct percpu_counter inflight;
@ -69,8 +69,8 @@ struct io_uring_task {
};
struct io_uring {
u32 head ____cacheline_aligned_in_smp;
u32 tail ____cacheline_aligned_in_smp;
u32 head;
u32 tail;
};
/*
@ -176,7 +176,6 @@ struct io_submit_state {
unsigned short submit_nr;
unsigned int cqes_count;
struct blk_plug plug;
struct io_uring_cqe cqes[16];
};
struct io_ev_fd {
@ -188,28 +187,34 @@ struct io_ev_fd {
};
struct io_alloc_cache {
struct hlist_head list;
struct io_wq_work_node list;
unsigned int nr_cached;
unsigned int max_cached;
size_t elem_size;
};
struct io_ring_ctx {
/* const or read-mostly hot data */
struct {
struct percpu_ref refs;
struct io_rings *rings;
unsigned int flags;
enum task_work_notify_mode notify_method;
unsigned int compat: 1;
unsigned int drain_next: 1;
unsigned int restricted: 1;
unsigned int off_timeout_used: 1;
unsigned int drain_active: 1;
unsigned int drain_disabled: 1;
unsigned int has_evfd: 1;
unsigned int syscall_iopoll: 1;
/* all CQEs should be posted only by the submitter task */
unsigned int task_complete: 1;
unsigned int lockless_cq: 1;
unsigned int syscall_iopoll: 1;
unsigned int poll_activated: 1;
unsigned int drain_disabled: 1;
unsigned int compat: 1;
struct task_struct *submitter_task;
struct io_rings *rings;
struct percpu_ref refs;
enum task_work_notify_mode notify_method;
} ____cacheline_aligned_in_smp;
/* submission data */
@ -237,7 +242,6 @@ struct io_ring_ctx {
* uring_lock, and updated through io_uring_register(2)
*/
struct io_rsrc_node *rsrc_node;
int rsrc_cached_refs;
atomic_t cancel_seq;
struct io_file_table file_table;
unsigned nr_user_files;
@ -248,32 +252,21 @@ struct io_ring_ctx {
struct io_buffer_list *io_bl;
struct xarray io_bl_xa;
struct list_head io_buffers_cache;
struct io_hash_table cancel_table_locked;
struct list_head cq_overflow_list;
struct io_alloc_cache apoll_cache;
struct io_alloc_cache netmsg_cache;
/*
* ->iopoll_list is protected by the ctx->uring_lock for
* io_uring instances that don't use IORING_SETUP_SQPOLL.
* For SQPOLL, only the single threaded io_sq_thread() will
* manipulate the list, hence no extra locking is needed there.
*/
struct io_wq_work_list iopoll_list;
bool poll_multi_queue;
} ____cacheline_aligned_in_smp;
/* IRQ completion list, under ->completion_lock */
struct io_wq_work_list locked_free_list;
unsigned int locked_free_nr;
const struct cred *sq_creds; /* cred used for __io_sq_thread() */
struct io_sq_data *sq_data; /* if using sq thread polling */
struct wait_queue_head sqo_sq_wait;
struct list_head sqd_list;
unsigned long check_cq;
unsigned int file_alloc_start;
unsigned int file_alloc_end;
struct xarray personalities;
u32 pers_next;
struct {
/*
* We cache a range of free CQEs we can use, once exhausted it
@ -285,54 +278,69 @@ struct io_ring_ctx {
unsigned cached_cq_tail;
unsigned cq_entries;
struct io_ev_fd __rcu *io_ev_fd;
struct wait_queue_head cq_wait;
unsigned cq_extra;
} ____cacheline_aligned_in_smp;
/*
* task_work and async notification delivery cacheline. Expected to
* regularly bounce b/w CPUs.
*/
struct {
spinlock_t completion_lock;
bool poll_multi_queue;
/*
* ->iopoll_list is protected by the ctx->uring_lock for
* io_uring instances that don't use IORING_SETUP_SQPOLL.
* For SQPOLL, only the single threaded io_sq_thread() will
* manipulate the list, hence no extra locking is needed there.
*/
struct io_wq_work_list iopoll_list;
struct io_hash_table cancel_table;
struct llist_head work_llist;
struct list_head io_buffers_comp;
unsigned long check_cq;
atomic_t cq_wait_nr;
atomic_t cq_timeouts;
struct wait_queue_head cq_wait;
} ____cacheline_aligned_in_smp;
/* timeouts */
struct {
spinlock_t timeout_lock;
atomic_t cq_timeouts;
struct list_head timeout_list;
struct list_head ltimeout_list;
unsigned cq_last_tm_flush;
} ____cacheline_aligned_in_smp;
/* Keep this last, we don't need it for the fast path */
struct io_uring_cqe completion_cqes[16];
spinlock_t completion_lock;
/* IRQ completion list, under ->completion_lock */
struct io_wq_work_list locked_free_list;
unsigned int locked_free_nr;
struct list_head io_buffers_comp;
struct list_head cq_overflow_list;
struct io_hash_table cancel_table;
const struct cred *sq_creds; /* cred used for __io_sq_thread() */
struct io_sq_data *sq_data; /* if using sq thread polling */
struct wait_queue_head sqo_sq_wait;
struct list_head sqd_list;
unsigned int file_alloc_start;
unsigned int file_alloc_end;
struct xarray personalities;
u32 pers_next;
struct list_head io_buffers_cache;
/* Keep this last, we don't need it for the fast path */
struct wait_queue_head poll_wq;
struct io_restriction restrictions;
struct task_struct *submitter_task;
/* slow path rsrc auxiliary data, used by update/register */
struct io_rsrc_node *rsrc_backup_node;
struct io_mapped_ubuf *dummy_ubuf;
struct io_rsrc_data *file_data;
struct io_rsrc_data *buf_data;
struct delayed_work rsrc_put_work;
struct callback_head rsrc_put_tw;
struct llist_head rsrc_put_llist;
/* protected by ->uring_lock */
struct list_head rsrc_ref_list;
spinlock_t rsrc_ref_lock;
struct io_alloc_cache rsrc_node_cache;
struct wait_queue_head rsrc_quiesce_wq;
unsigned rsrc_quiesce;
struct list_head io_buffers_pages;
@ -357,10 +365,25 @@ struct io_ring_ctx {
u32 iowq_limits[2];
bool iowq_limits_set;
struct callback_head poll_wq_task_work;
struct list_head defer_list;
unsigned sq_thread_idle;
/* protected by ->completion_lock */
unsigned evfd_last_cq_tail;
/*
* If IORING_SETUP_NO_MMAP is used, then the below holds
* the gup'ed pages for the two rings, and the sqes.
*/
unsigned short n_ring_pages;
unsigned short n_sqe_pages;
struct page **ring_pages;
struct page **sqe_pages;
};
/*
 * State handed to task_work callbacks; io_req_tw_func_t takes a pointer to
 * this structure as its second argument.
 */
struct io_tw_state {
/* ->uring_lock is taken, callbacks can use io_tw_lock to lock it */
bool locked;
};
enum {
@ -391,7 +414,6 @@ enum {
REQ_F_SINGLE_POLL_BIT,
REQ_F_DOUBLE_POLL_BIT,
REQ_F_PARTIAL_IO_BIT,
REQ_F_CQE32_INIT_BIT,
REQ_F_APOLL_MULTISHOT_BIT,
REQ_F_CLEAR_POLLIN_BIT,
REQ_F_HASH_LOCKED_BIT,
@ -461,15 +483,13 @@ enum {
REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT),
/* fast poll multishot mode */
REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT),
/* ->extra1 and ->extra2 are initialised */
REQ_F_CQE32_INIT = BIT(REQ_F_CQE32_INIT_BIT),
/* recvmsg special flag, clear EPOLLIN */
REQ_F_CLEAR_POLLIN = BIT(REQ_F_CLEAR_POLLIN_BIT),
/* hashed into ->cancel_hash_locked, protected by ->uring_lock */
REQ_F_HASH_LOCKED = BIT(REQ_F_HASH_LOCKED_BIT),
};
typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);
typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts);
struct io_task_work {
struct llist_node node;
@ -559,14 +579,9 @@ struct io_kiocb {
atomic_t refs;
atomic_t poll_refs;
struct io_task_work io_task_work;
unsigned nr_tw;
/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
union {
struct hlist_node hash_node;
struct {
u64 extra1;
u64 extra2;
};
};
struct hlist_node hash_node;
/* internal polling, see IORING_FEAT_FAST_POLL */
struct async_poll *apoll;
/* opcode allocated if it needs to store data for async defer */
@ -576,6 +591,11 @@ struct io_kiocb {
/* custom credentials, valid IFF REQ_F_CREDS is set */
const struct cred *creds;
struct io_wq_work work;
struct {
u64 extra1;
u64 extra2;
} big_cqe;
};
struct io_overflow_cqe {

View File

@ -1279,6 +1279,21 @@ static inline bool is_cow_mapping(vm_flags_t flags)
return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
}
#ifndef CONFIG_MMU
/* Tell whether @flags describe a mapping that NOMMU treats as shared. */
static inline bool is_nommu_shared_mapping(vm_flags_t flags)
{
	/*
	 * On NOMMU a "shared" mapping is either an ordinary MAP_SHARED
	 * mapping or one of the selected R/O MAP_PRIVATE file mappings that
	 * act as an effective R/O overlay of a file mapping. A R/O
	 * MAP_PRIVATE mapping might still modify the underlying memory if
	 * ptrace is active, so this is only possible if ptrace does not
	 * apply. Note that there is no mprotect() to upgrade write
	 * permissions later.
	 */
	const bool may_share = flags & VM_MAYSHARE;

	return may_share;
}
#endif
#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
#define SECTION_IN_PAGE_FLAGS
#endif

View File

@ -18,10 +18,11 @@ static inline int ip_mroute_opt(int opt)
int ip_mroute_setsockopt(struct sock *, int, sockptr_t, unsigned int);
int ip_mroute_getsockopt(struct sock *, int, sockptr_t, sockptr_t);
int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg);
int ipmr_ioctl(struct sock *sk, int cmd, void *arg);
int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg);
int ip_mr_init(void);
bool ipmr_rule_default(const struct fib_rule *rule);
int ipmr_sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg);
#else
static inline int ip_mroute_setsockopt(struct sock *sock, int optname,
sockptr_t optval, unsigned int optlen)
@ -35,7 +36,7 @@ static inline int ip_mroute_getsockopt(struct sock *sk, int optname,
return -ENOPROTOOPT;
}
static inline int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
static inline int ipmr_ioctl(struct sock *sk, int cmd, void *arg)
{
return -ENOIOCTLCMD;
}
@ -54,6 +55,12 @@ static inline bool ipmr_rule_default(const struct fib_rule *rule)
{
return true;
}
/*
 * Stub variant of ipmr_sk_ioctl() for the #else branch of this header:
 * it ignores all of its arguments and unconditionally returns 1.
 * NOTE(review): a positive return presumably tells the caller that the
 * ioctl was not processed here - confirm against the caller.
 */
static inline int ipmr_sk_ioctl(struct sock *sk, unsigned int cmd,
void __user *arg)
{
return 1;
}
#endif
#define VIFF_STATIC 0x8000

View File

@ -29,10 +29,10 @@ struct sock;
extern int ip6_mroute_setsockopt(struct sock *, int, sockptr_t, unsigned int);
extern int ip6_mroute_getsockopt(struct sock *, int, sockptr_t, sockptr_t);
extern int ip6_mr_input(struct sk_buff *skb);
extern int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg);
extern int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg);
extern int ip6_mr_init(void);
extern void ip6_mr_cleanup(void);
int ip6mr_ioctl(struct sock *sk, int cmd, void *arg);
#else
static inline int ip6_mroute_setsockopt(struct sock *sock, int optname,
sockptr_t optval, unsigned int optlen)
@ -48,7 +48,7 @@ int ip6_mroute_getsockopt(struct sock *sock,
}
static inline
int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg)
int ip6mr_ioctl(struct sock *sk, int cmd, void *arg)
{
return -ENOIOCTLCMD;
}
@ -100,6 +100,27 @@ extern int ip6mr_get_route(struct net *net, struct sk_buff *skb,
#ifdef CONFIG_IPV6_MROUTE
bool mroute6_is_socket(struct net *net, struct sk_buff *skb);
extern int ip6mr_sk_done(struct sock *sk);
/*
 * Handle the ip6mr ioctls whose argument is a structure in user memory.
 *
 * For SIOCGETMIFCNT_IN6 and SIOCGETSGCNT_IN6 an on-stack buffer of the
 * matching request type is handed to sock_ioctl_inout() together with the
 * user pointer, and that call's result is returned. Every other command
 * yields 1.
 */
static inline int ip6mr_sk_ioctl(struct sock *sk, unsigned int cmd,
				 void __user *arg)
{
	/* These userspace buffers will be consumed by ip6mr_ioctl() */
	if (cmd == SIOCGETMIFCNT_IN6) {
		struct sioc_mif_req6 mif_req;

		return sock_ioctl_inout(sk, cmd, arg, &mif_req,
					sizeof(mif_req));
	}

	if (cmd == SIOCGETSGCNT_IN6) {
		struct sioc_sg_req6 sg_req;

		return sock_ioctl_inout(sk, cmd, arg, &sg_req,
					sizeof(sg_req));
	}

	return 1;
}
#else
static inline bool mroute6_is_socket(struct net *net, struct sk_buff *skb)
{
@ -109,5 +130,11 @@ static inline int ip6mr_sk_done(struct sock *sk)
{
return 0;
}
/*
 * Stub variant of ip6mr_sk_ioctl() for the #else branch (the real version
 * lives under CONFIG_IPV6_MROUTE): ignores its arguments and always
 * returns 1, the same value the real version returns for commands it does
 * not handle.
 */
static inline int ip6mr_sk_ioctl(struct sock *sk, unsigned int cmd,
void __user *arg)
{
return 1;
}
#endif
#endif

View File

@ -320,6 +320,7 @@ struct ucred {
*/
#define MSG_ZEROCOPY 0x4000000 /* Use user data in kernel path */
#define MSG_SPLICE_PAGES 0x8000000 /* Splice the pages from the iterator in sendmsg() */
#define MSG_FASTOPEN 0x20000000 /* Send data in TCP SYN */
#define MSG_CMSG_CLOEXEC 0x40000000 /* Set close_on_exec for file
descriptor received through
@ -330,6 +331,8 @@ struct ucred {
#define MSG_CMSG_COMPAT 0 /* We never have 32 bit fixups */
#endif
/* Flags to be cleared on entry by sendmsg and sendmmsg syscalls */
#define MSG_INTERNAL_SENDMSG_FLAGS (MSG_SPLICE_PAGES)
/* Setsockoptions(2) level. Thanks to BSD these must match IPPROTO_xxx */
#define SOL_IP 0

View File

@ -49,7 +49,8 @@ struct iov_iter {
};
size_t count;
union {
const struct iovec *iov;
/* use iter_iov() to get the current vec */
const struct iovec *__iov;
const struct kvec *kvec;
const struct bio_vec *bvec;
struct xarray *xarray;
@ -66,6 +67,10 @@ struct iov_iter {
};
};
/*
 * Accessors for the iovec an iov_iter currently points at.
 *
 * iter_iov(iter)      - the iterator's ->__iov pointer (the current vec)
 * iter_iov_addr(iter) - ->iov_base of the current vec plus ->iov_offset
 * iter_iov_len(iter)  - ->iov_len of the current vec minus ->iov_offset
 *
 * Every expansion is fully parenthesized so it can be used inside any
 * larger expression. iter_iov_addr() and iter_iov_len() evaluate @iter
 * more than once, so do not pass an argument with side effects.
 */
#define iter_iov(iter)		((iter)->__iov)
#define iter_iov_addr(iter)	(iter_iov(iter)->iov_base + (iter)->iov_offset)
#define iter_iov_len(iter)	(iter_iov(iter)->iov_len - (iter)->iov_offset)
static inline enum iter_type iov_iter_type(const struct iov_iter *i)
{
return i->iter_type;
@ -141,15 +146,6 @@ static inline size_t iov_length(const struct iovec *iov, unsigned long nr_segs)
return ret;
}
static inline struct iovec iov_iter_iovec(const struct iov_iter *iter)
{
return (struct iovec) {
.iov_base = iter->iov->iov_base + iter->iov_offset,
.iov_len = min(iter->count,
iter->iov->iov_len - iter->iov_offset),
};
}
size_t copy_page_from_iter_atomic(struct page *page, unsigned offset,
size_t bytes, struct iov_iter *i);
void iov_iter_advance(struct iov_iter *i, size_t bytes);
@ -343,6 +339,7 @@ ssize_t __import_iovec(int type, const struct iovec __user *uvec,
struct iov_iter *i, bool compat);
int import_single_range(int type, void __user *buf, size_t len,
struct iovec *iov, struct iov_iter *i);
int import_ubuf(int type, void __user *buf, size_t len, struct iov_iter *i);
static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction,
void __user *buf, size_t count)

View File

@ -109,4 +109,25 @@ void phonet_sysctl_exit(void);
int isi_register(void);
void isi_unregister(void);
/* Return true when @sk's address family (->sk_family) is PF_PHONET. */
static inline bool sk_is_phonet(struct sock *sk)
{
return sk->sk_family == PF_PHONET;
}
/*
 * Handle the phonet ioctls that take an int in user memory.
 *
 * For SIOCPNADDRESOURCE and SIOCPNDELRESOURCE the int is fetched from @arg
 * with get_user() (-EFAULT on failure) and a pointer to the kernel copy is
 * passed on to the protocol's ->ioctl() handler, whose result is returned.
 * Any other command returns 1.
 */
static inline int phonet_sk_ioctl(struct sock *sk, unsigned int cmd,
				  void __user *arg)
{
	int val;

	/* A positive return value means that the ioctl was not processed */
	if (cmd != SIOCPNADDRESOURCE && cmd != SIOCPNDELRESOURCE)
		return 1;

	if (get_user(val, (int __user *)arg))
		return -EFAULT;

	return sk->sk_prot->ioctl(sk, cmd, &val);
}
#endif

View File

@ -1228,7 +1228,7 @@ struct proto {
bool kern);
int (*ioctl)(struct sock *sk, int cmd,
unsigned long arg);
int *karg);
int (*init)(struct sock *sk);
void (*destroy)(struct sock *sk);
void (*shutdown)(struct sock *sk, int how);
@ -2972,6 +2972,9 @@ int sock_get_timeout(long timeo, void *optval, bool old_timeval);
int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
sockptr_t optval, int optlen, bool old_timeval);
int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
void __user *arg, void *karg, size_t size);
int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg);
static inline bool sk_is_readable(struct sock *sk)
{
if (sk->sk_prot->sock_is_readable)

View File

@ -342,7 +342,7 @@ void tcp_release_cb(struct sock *sk);
void tcp_wfree(struct sk_buff *skb);
void tcp_write_timer_handler(struct sock *sk);
void tcp_delack_timer_handler(struct sock *sk);
int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg);
int tcp_ioctl(struct sock *sk, int cmd, int *karg);
int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb);
void tcp_rcv_established(struct sock *sk, struct sk_buff *skb);
void tcp_rcv_space_adjust(struct sock *sk);

View File

@ -284,7 +284,7 @@ void udp_flush_pending_frames(struct sock *sk);
int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size);
void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst);
int udp_rcv(struct sk_buff *skb);
int udp_ioctl(struct sock *sk, int cmd, unsigned long arg);
int udp_ioctl(struct sock *sk, int cmd, int *karg);
int udp_init_sock(struct sock *sk);
int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
int __udp_disconnect(struct sock *sk, int flags);

View File

@ -360,19 +360,18 @@ TRACE_EVENT(io_uring_complete,
);
/**
* io_uring_submit_sqe - called before submitting one SQE
* io_uring_submit_req - called before submitting a request
*
* @req: pointer to a submitted request
* @force_nonblock: whether a context blocking or not
*
* Allows to track SQE submitting, to understand what was the source of it, SQ
* thread or io_uring_enter call.
*/
TRACE_EVENT(io_uring_submit_sqe,
TRACE_EVENT(io_uring_submit_req,
TP_PROTO(struct io_kiocb *req, bool force_nonblock),
TP_PROTO(struct io_kiocb *req),
TP_ARGS(req, force_nonblock),
TP_ARGS(req),
TP_STRUCT__entry (
__field( void *, ctx )
@ -380,7 +379,6 @@ TRACE_EVENT(io_uring_submit_sqe,
__field( unsigned long long, user_data )
__field( u8, opcode )
__field( u32, flags )
__field( bool, force_nonblock )
__field( bool, sq_thread )
__string( op_str, io_uring_get_opcode(req->opcode) )
@ -392,16 +390,15 @@ TRACE_EVENT(io_uring_submit_sqe,
__entry->user_data = req->cqe.user_data;
__entry->opcode = req->opcode;
__entry->flags = req->flags;
__entry->force_nonblock = force_nonblock;
__entry->sq_thread = req->ctx->flags & IORING_SETUP_SQPOLL;
__assign_str(op_str, io_uring_get_opcode(req->opcode));
),
TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, flags 0x%x, "
"non block %d, sq_thread %d", __entry->ctx, __entry->req,
"sq_thread %d", __entry->ctx, __entry->req,
__entry->user_data, __get_str(op_str),
__entry->flags, __entry->force_nonblock, __entry->sq_thread)
__entry->flags, __entry->sq_thread)
);
/*

View File

@ -173,6 +173,23 @@ enum {
*/
#define IORING_SETUP_DEFER_TASKRUN (1U << 13)
/*
* Application provides the memory for the rings
*/
#define IORING_SETUP_NO_MMAP (1U << 14)
/*
* Register the ring fd in itself for use with
* IORING_REGISTER_USE_REGISTERED_RING; return a registered fd index rather
* than an fd.
*/
#define IORING_SETUP_REGISTERED_FD_ONLY (1U << 15)
/*
* Removes indirection through the SQ index array.
*/
#define IORING_SETUP_NO_SQARRAY (1U << 16)
enum io_uring_op {
IORING_OP_NOP,
IORING_OP_READV,
@ -252,6 +269,7 @@ enum io_uring_op {
#define IORING_TIMEOUT_REALTIME (1U << 3)
#define IORING_LINK_TIMEOUT_UPDATE (1U << 4)
#define IORING_TIMEOUT_ETIME_SUCCESS (1U << 5)
#define IORING_TIMEOUT_MULTISHOT (1U << 6)
#define IORING_TIMEOUT_CLOCK_MASK (IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME)
#define IORING_TIMEOUT_UPDATE_MASK (IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE)
/*
@ -286,11 +304,15 @@ enum io_uring_op {
* request 'user_data'
* IORING_ASYNC_CANCEL_ANY Match any request
* IORING_ASYNC_CANCEL_FD_FIXED 'fd' passed in is a fixed descriptor
* IORING_ASYNC_CANCEL_USERDATA Match on user_data, default for no other key
* IORING_ASYNC_CANCEL_OP Match request based on opcode
*/
#define IORING_ASYNC_CANCEL_ALL (1U << 0)
#define IORING_ASYNC_CANCEL_FD (1U << 1)
#define IORING_ASYNC_CANCEL_ANY (1U << 2)
#define IORING_ASYNC_CANCEL_FD_FIXED (1U << 3)
#define IORING_ASYNC_CANCEL_USERDATA (1U << 4)
#define IORING_ASYNC_CANCEL_OP (1U << 5)
/*
* send/sendmsg and recv/recvmsg flags (sqe->ioprio)
@ -349,6 +371,8 @@ enum {
* applicable for IORING_MSG_DATA, obviously.
*/
#define IORING_MSG_RING_CQE_SKIP (1U << 0)
/* Pass through the flags from sqe->file_index to cqe->flags */
#define IORING_MSG_RING_FLAGS_PASS (1U << 1)
/*
* IO completion data structure (Completion Queue Entry)
@ -389,6 +413,9 @@ enum {
#define IORING_OFF_SQ_RING 0ULL
#define IORING_OFF_CQ_RING 0x8000000ULL
#define IORING_OFF_SQES 0x10000000ULL
#define IORING_OFF_PBUF_RING 0x80000000ULL
#define IORING_OFF_PBUF_SHIFT 16
#define IORING_OFF_MMAP_MASK 0xf8000000ULL
/*
* Filled with the offset for mmap(2)
@ -402,7 +429,7 @@ struct io_sqring_offsets {
__u32 dropped;
__u32 array;
__u32 resv1;
__u64 resv2;
__u64 user_addr;
};
/*
@ -421,7 +448,7 @@ struct io_cqring_offsets {
__u32 cqes;
__u32 flags;
__u32 resv1;
__u64 resv2;
__u64 user_addr;
};
/*
@ -472,6 +499,7 @@ struct io_uring_params {
#define IORING_FEAT_RSRC_TAGS (1U << 10)
#define IORING_FEAT_CQE_SKIP (1U << 11)
#define IORING_FEAT_LINKED_FILE (1U << 12)
#define IORING_FEAT_REG_REG_RING (1U << 13)
/*
* io_uring_register(2) opcodes and arguments
@ -519,7 +547,10 @@ enum {
IORING_REGISTER_FILE_ALLOC_RANGE = 25,
/* this goes last */
IORING_REGISTER_LAST
IORING_REGISTER_LAST,
/* flag added to the opcode to use a registered ring fd */
IORING_REGISTER_USE_REGISTERED_RING = 1U << 31
};
/* io-wq worker categories */
@ -564,19 +595,6 @@ struct io_uring_rsrc_update2 {
__u32 resv2;
};
struct io_uring_notification_slot {
__u64 tag;
__u64 resv[3];
};
struct io_uring_notification_register {
__u32 nr_slots;
__u32 resv;
__u64 resv2;
__u64 data;
__u64 resv3;
};
/* Skip updating fd indexes set to this value in the fd table */
#define IORING_REGISTER_FILES_SKIP (-2)
@ -631,12 +649,26 @@ struct io_uring_buf_ring {
};
};
/*
* Flags for IORING_REGISTER_PBUF_RING.
*
* IOU_PBUF_RING_MMAP: If set, kernel will allocate the memory for the ring.
* The application must not set a ring_addr in struct
* io_uring_buf_reg, instead it must subsequently call
* mmap(2) with the offset set as:
* IORING_OFF_PBUF_RING | (bgid << IORING_OFF_PBUF_SHIFT)
* to get a virtual mapping for the ring.
*/
enum {
IOU_PBUF_RING_MMAP = 1,
};
/* argument for IORING_(UN)REGISTER_PBUF_RING */
struct io_uring_buf_reg {
__u64 ring_addr;
__u32 ring_entries;
__u16 bgid;
__u16 pad;
__u16 flags;
__u64 resv[3];
};
@ -674,7 +706,9 @@ struct io_uring_sync_cancel_reg {
__s32 fd;
__u32 flags;
struct __kernel_timespec timeout;
__u64 pad[4];
__u8 opcode;
__u8 pad[7];
__u64 pad2[3];
};
/*
@ -694,6 +728,14 @@ struct io_uring_recvmsg_out {
__u32 flags;
};
/*
* Argument for IORING_OP_URING_CMD when file is a socket
*/
enum {
SOCKET_URING_OP_SIOCINQ = 0,
SOCKET_URING_OP_SIOCOUTQ,
};
#ifdef __cplusplus
}
#endif

View File

@ -39,6 +39,7 @@ int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
ma->addr = READ_ONCE(sqe->addr);
ma->len = READ_ONCE(sqe->len);
ma->advice = READ_ONCE(sqe->fadvise_advice);
req->flags |= REQ_F_FORCE_ASYNC;
return 0;
#else
return -EOPNOTSUPP;
@ -51,8 +52,7 @@ int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
struct io_madvise *ma = io_kiocb_to_cmd(req, struct io_madvise);
int ret;
if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice);
io_req_set_res(req, ret, 0);
@ -62,6 +62,18 @@ int io_madvise(struct io_kiocb *req, unsigned int issue_flags)
#endif
}
/*
 * Decide whether an fadvise request has to be forced async.
 *
 * Returns false for POSIX_FADV_NORMAL, POSIX_FADV_RANDOM and
 * POSIX_FADV_SEQUENTIAL, and true for every other advice value.
 */
static bool io_fadvise_force_async(struct io_fadvise *fa)
{
	const bool inline_ok = fa->advice == POSIX_FADV_NORMAL ||
			       fa->advice == POSIX_FADV_RANDOM ||
			       fa->advice == POSIX_FADV_SEQUENTIAL;

	return !inline_ok;
}
int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_fadvise *fa = io_kiocb_to_cmd(req, struct io_fadvise);
@ -72,6 +84,8 @@ int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
fa->offset = READ_ONCE(sqe->off);
fa->len = READ_ONCE(sqe->len);
fa->advice = READ_ONCE(sqe->fadvise_advice);
if (io_fadvise_force_async(fa))
req->flags |= REQ_F_FORCE_ASYNC;
return 0;
}
@ -80,16 +94,7 @@ int io_fadvise(struct io_kiocb *req, unsigned int issue_flags)
struct io_fadvise *fa = io_kiocb_to_cmd(req, struct io_fadvise);
int ret;
if (issue_flags & IO_URING_F_NONBLOCK) {
switch (fa->advice) {
case POSIX_FADV_NORMAL:
case POSIX_FADV_RANDOM:
case POSIX_FADV_SEQUENTIAL:
break;
default:
return -EAGAIN;
}
}
WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK && io_fadvise_force_async(fa));
ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
if (ret < 0)

View File

@ -7,47 +7,60 @@
#define IO_ALLOC_CACHE_MAX 512
struct io_cache_entry {
struct hlist_node node;
struct io_wq_work_node node;
};
static inline bool io_alloc_cache_put(struct io_alloc_cache *cache,
struct io_cache_entry *entry)
{
if (cache->nr_cached < IO_ALLOC_CACHE_MAX) {
if (cache->nr_cached < cache->max_cached) {
cache->nr_cached++;
hlist_add_head(&entry->node, &cache->list);
wq_stack_add_head(&entry->node, &cache->list);
/* KASAN poisons object */
kasan_slab_free_mempool(entry);
return true;
}
return false;
}
/*
 * Return true when @cache holds no entries, i.e. the head of its list has
 * a NULL ->next pointer.
 */
static inline bool io_alloc_cache_empty(struct io_alloc_cache *cache)
{
return !cache->list.next;
}
static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *cache)
{
if (!hlist_empty(&cache->list)) {
struct hlist_node *node = cache->list.first;
if (cache->list.next) {
struct io_cache_entry *entry;
hlist_del(node);
entry = container_of(cache->list.next, struct io_cache_entry, node);
kasan_unpoison_range(entry, cache->elem_size);
cache->list.next = cache->list.next->next;
cache->nr_cached--;
return container_of(node, struct io_cache_entry, node);
return entry;
}
return NULL;
}
static inline void io_alloc_cache_init(struct io_alloc_cache *cache)
static inline void io_alloc_cache_init(struct io_alloc_cache *cache,
unsigned max_nr, size_t size)
{
INIT_HLIST_HEAD(&cache->list);
cache->list.next = NULL;
cache->nr_cached = 0;
cache->max_cached = max_nr;
cache->elem_size = size;
}
static inline void io_alloc_cache_free(struct io_alloc_cache *cache,
void (*free)(struct io_cache_entry *))
{
while (!hlist_empty(&cache->list)) {
struct hlist_node *node = cache->list.first;
while (1) {
struct io_cache_entry *entry = io_alloc_cache_get(cache);
hlist_del(node);
free(container_of(node, struct io_cache_entry, node));
if (!entry)
break;
free(entry);
}
cache->nr_cached = 0;
}

View File

@ -22,33 +22,54 @@ struct io_cancel {
u64 addr;
u32 flags;
s32 fd;
u8 opcode;
};
#define CANCEL_FLAGS (IORING_ASYNC_CANCEL_ALL | IORING_ASYNC_CANCEL_FD | \
IORING_ASYNC_CANCEL_ANY | IORING_ASYNC_CANCEL_FD_FIXED)
IORING_ASYNC_CANCEL_ANY | IORING_ASYNC_CANCEL_FD_FIXED | \
IORING_ASYNC_CANCEL_USERDATA | IORING_ASYNC_CANCEL_OP)
/*
* Returns true if the request matches the criteria outlined by 'cd'.
*/
bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd)
{
	/* A request from another ring never matches. */
	if (req->ctx != cd->ctx)
		return false;

	/* IORING_ASYNC_CANCEL_ANY skips every key comparison. */
	if (!(cd->flags & IORING_ASYNC_CANCEL_ANY)) {
		/*
		 * user_data is compared when explicitly requested, and also
		 * when neither the FD nor the OP key was given.
		 */
		bool match_user_data =
			(cd->flags & IORING_ASYNC_CANCEL_USERDATA) ||
			!(cd->flags & (IORING_ASYNC_CANCEL_FD |
				       IORING_ASYNC_CANCEL_OP));

		if ((cd->flags & IORING_ASYNC_CANCEL_FD) &&
		    req->file != cd->file)
			return false;
		if ((cd->flags & IORING_ASYNC_CANCEL_OP) &&
		    req->opcode != cd->opcode)
			return false;
		if (match_user_data && req->cqe.user_data != cd->data)
			return false;
	}

	/*
	 * With ALL or ANY, a request already stamped with this cancel
	 * sequence is rejected; otherwise it is stamped with it now.
	 */
	if (cd->flags & (IORING_ASYNC_CANCEL_ALL | IORING_ASYNC_CANCEL_ANY)) {
		if (cd->seq == req->work.cancel_seq)
			return false;
		req->work.cancel_seq = cd->seq;
	}

	return true;
}
static bool io_cancel_cb(struct io_wq_work *work, void *data)
{
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
struct io_cancel_data *cd = data;
if (req->ctx != cd->ctx)
return false;
if (cd->flags & IORING_ASYNC_CANCEL_ANY) {
;
} else if (cd->flags & IORING_ASYNC_CANCEL_FD) {
if (req->file != cd->file)
return false;
} else {
if (req->cqe.user_data != cd->data)
return false;
}
if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) {
if (cd->seq == req->work.cancel_seq)
return false;
req->work.cancel_seq = cd->seq;
}
return true;
return io_cancel_req_match(req, cd);
}
static int io_async_cancel_one(struct io_uring_task *tctx,
@ -111,7 +132,7 @@ int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (unlikely(req->flags & REQ_F_BUFFER_SELECT))
return -EINVAL;
if (sqe->off || sqe->len || sqe->splice_fd_in)
if (sqe->off || sqe->splice_fd_in)
return -EINVAL;
cancel->addr = READ_ONCE(sqe->addr);
@ -123,6 +144,11 @@ int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return -EINVAL;
cancel->fd = READ_ONCE(sqe->fd);
}
if (cancel->flags & IORING_ASYNC_CANCEL_OP) {
if (cancel->flags & IORING_ASYNC_CANCEL_ANY)
return -EINVAL;
cancel->opcode = READ_ONCE(sqe->len);
}
return 0;
}
@ -169,6 +195,7 @@ int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
.ctx = req->ctx,
.data = cancel->addr,
.flags = cancel->flags,
.opcode = cancel->opcode,
.seq = atomic_inc_return(&req->ctx->cancel_seq),
};
struct io_uring_task *tctx = req->task->io_uring;
@ -216,13 +243,10 @@ static int __io_sync_cancel(struct io_uring_task *tctx,
/* fixed must be grabbed every time since we drop the uring_lock */
if ((cd->flags & IORING_ASYNC_CANCEL_FD) &&
(cd->flags & IORING_ASYNC_CANCEL_FD_FIXED)) {
unsigned long file_ptr;
if (unlikely(fd >= ctx->nr_user_files))
return -EBADF;
fd = array_index_nospec(fd, ctx->nr_user_files);
file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
cd->file = (struct file *) (file_ptr & FFS_MASK);
cd->file = io_file_from_index(&ctx->file_table, fd);
if (!cd->file)
return -EBADF;
}
@ -241,17 +265,22 @@ int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg)
struct io_uring_sync_cancel_reg sc;
struct fd f = { };
DEFINE_WAIT(wait);
int ret;
int ret, i;
if (copy_from_user(&sc, arg, sizeof(sc)))
return -EFAULT;
if (sc.flags & ~CANCEL_FLAGS)
return -EINVAL;
if (sc.pad[0] || sc.pad[1] || sc.pad[2] || sc.pad[3])
return -EINVAL;
for (i = 0; i < ARRAY_SIZE(sc.pad); i++)
if (sc.pad[i])
return -EINVAL;
for (i = 0; i < ARRAY_SIZE(sc.pad2); i++)
if (sc.pad2[i])
return -EINVAL;
cd.data = sc.addr;
cd.flags = sc.flags;
cd.opcode = sc.opcode;
/* we can grab a normal file descriptor upfront */
if ((cd.flags & IORING_ASYNC_CANCEL_FD) &&

View File

@ -8,11 +8,11 @@ struct io_cancel_data {
u64 data;
struct file *file;
};
u8 opcode;
u32 flags;
int seq;
};
int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags);
@ -21,3 +21,4 @@ int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd,
void init_hash_table(struct io_hash_table *table, unsigned size);
int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg);
bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd);

View File

@ -48,10 +48,13 @@ static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id,
return 0;
}
static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
struct seq_file *m)
/*
* Caller holds a reference to the file already, we don't need to do
* anything else to get an extra reference.
*/
__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
{
struct io_sq_data *sq = NULL;
struct io_ring_ctx *ctx = f->private_data;
struct io_overflow_cqe *ocqe;
struct io_rings *r = ctx->rings;
unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1;
@ -62,6 +65,7 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
unsigned int cq_shift = 0;
unsigned int sq_shift = 0;
unsigned int sq_entries, cq_entries;
int sq_pid = -1, sq_cpu = -1;
bool has_lock;
unsigned int i;
@ -91,6 +95,8 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
struct io_uring_sqe *sqe;
unsigned int sq_idx;
if (ctx->flags & IORING_SETUP_NO_SQARRAY)
break;
sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]);
if (sq_idx > sq_mask)
continue;
@ -139,13 +145,19 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
has_lock = mutex_trylock(&ctx->uring_lock);
if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) {
sq = ctx->sq_data;
if (!sq->thread)
sq = NULL;
struct io_sq_data *sq = ctx->sq_data;
if (mutex_trylock(&sq->lock)) {
if (sq->thread) {
sq_pid = task_pid_nr(sq->thread);
sq_cpu = task_cpu(sq->thread);
}
mutex_unlock(&sq->lock);
}
}
seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1);
seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1);
seq_printf(m, "SqThread:\t%d\n", sq_pid);
seq_printf(m, "SqThreadCpu:\t%d\n", sq_cpu);
seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
struct file *f = io_file_from_index(&ctx->file_table, i);
@ -205,14 +217,4 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
spin_unlock(&ctx->completion_lock);
}
__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
{
struct io_ring_ctx *ctx = f->private_data;
if (percpu_ref_tryget(&ctx->refs)) {
__io_uring_show_fdinfo(ctx, m);
percpu_ref_put(&ctx->refs);
}
}
#endif

View File

@ -64,7 +64,6 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
u32 slot_index)
__must_hold(&req->ctx->uring_lock)
{
bool needs_switch = false;
struct io_fixed_file *file_slot;
int ret;
@ -79,20 +78,13 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
file_slot = io_fixed_file_slot(&ctx->file_table, slot_index);
if (file_slot->file_ptr) {
struct file *old_file;
ret = io_rsrc_node_switch_start(ctx);
if (ret)
goto err;
old_file = (struct file *)(file_slot->file_ptr & FFS_MASK);
ret = io_queue_rsrc_removal(ctx->file_data, slot_index,
ctx->rsrc_node, old_file);
io_slot_file(file_slot));
if (ret)
goto err;
return ret;
file_slot->file_ptr = 0;
io_file_bitmap_clear(&ctx->file_table, slot_index);
needs_switch = true;
}
ret = io_scm_file_account(ctx, file);
@ -101,9 +93,6 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
io_fixed_file_set(file_slot, file);
io_file_bitmap_set(&ctx->file_table, slot_index);
}
err:
if (needs_switch)
io_rsrc_node_switch(ctx, ctx->file_data);
return ret;
}
@ -149,30 +138,25 @@ int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags,
int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset)
{
struct io_fixed_file *file_slot;
struct file *file;
int ret;
if (unlikely(!ctx->file_data))
return -ENXIO;
if (offset >= ctx->nr_user_files)
return -EINVAL;
ret = io_rsrc_node_switch_start(ctx);
if (ret)
return ret;
offset = array_index_nospec(offset, ctx->nr_user_files);
file_slot = io_fixed_file_slot(&ctx->file_table, offset);
if (!file_slot->file_ptr)
return -EBADF;
file = (struct file *)(file_slot->file_ptr & FFS_MASK);
ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file);
ret = io_queue_rsrc_removal(ctx->file_data, offset,
io_slot_file(file_slot));
if (ret)
return ret;
file_slot->file_ptr = 0;
io_file_bitmap_clear(&ctx->file_table, offset);
io_rsrc_node_switch(ctx, ctx->file_data);
return 0;
}

View File

@ -5,10 +5,6 @@
#include <linux/file.h>
#include <linux/io_uring_types.h>
#define FFS_NOWAIT 0x1UL
#define FFS_ISREG 0x2UL
#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG)
bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files);
void io_free_file_tables(struct io_file_table *table);
@ -43,21 +39,31 @@ io_fixed_file_slot(struct io_file_table *table, unsigned i)
return &table->files[i];
}
#define FFS_NOWAIT 0x1UL
#define FFS_ISREG 0x2UL
#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG)
/*
 * Extract the FFS_* flag bits stored in the low bits of @slot->file_ptr and
 * shift them up by REQ_F_SUPPORT_NOWAIT_BIT.
 */
static inline unsigned int io_slot_flags(struct io_fixed_file *slot)
{
	unsigned long ffs_bits = slot->file_ptr & ~FFS_MASK;

	return ffs_bits << REQ_F_SUPPORT_NOWAIT_BIT;
}
/*
 * Return the struct file pointer stored in @slot: ->file_ptr with the
 * FFS_NOWAIT/FFS_ISREG flag bits masked off via FFS_MASK.
 */
static inline struct file *io_slot_file(struct io_fixed_file *slot)
{
return (struct file *)(slot->file_ptr & FFS_MASK);
}
static inline struct file *io_file_from_index(struct io_file_table *table,
int index)
{
struct io_fixed_file *slot = io_fixed_file_slot(table, index);
return (struct file *) (slot->file_ptr & FFS_MASK);
return io_slot_file(io_fixed_file_slot(table, index));
}
static inline void io_fixed_file_set(struct io_fixed_file *file_slot,
struct file *file)
{
unsigned long file_ptr = (unsigned long) file;
file_ptr |= io_file_get_flags(file);
file_slot->file_ptr = file_ptr;
file_slot->file_ptr = (unsigned long)file |
(io_file_get_flags(file) >> REQ_F_SUPPORT_NOWAIT_BIT);
}
static inline void io_reset_alloc_hint(struct io_ring_ctx *ctx)

View File

@ -74,6 +74,7 @@ int io_renameat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
}
req->flags |= REQ_F_NEED_CLEANUP;
req->flags |= REQ_F_FORCE_ASYNC;
return 0;
}
@ -82,8 +83,7 @@ int io_renameat(struct io_kiocb *req, unsigned int issue_flags)
struct io_rename *ren = io_kiocb_to_cmd(req, struct io_rename);
int ret;
if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd,
ren->newpath, ren->flags);
@ -123,6 +123,7 @@ int io_unlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return PTR_ERR(un->filename);
req->flags |= REQ_F_NEED_CLEANUP;
req->flags |= REQ_F_FORCE_ASYNC;
return 0;
}
@ -131,8 +132,7 @@ int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
struct io_unlink *un = io_kiocb_to_cmd(req, struct io_unlink);
int ret;
if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
if (un->flags & AT_REMOVEDIR)
ret = do_rmdir(un->dfd, un->filename);
@ -170,6 +170,7 @@ int io_mkdirat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return PTR_ERR(mkd->filename);
req->flags |= REQ_F_NEED_CLEANUP;
req->flags |= REQ_F_FORCE_ASYNC;
return 0;
}
@ -178,8 +179,7 @@ int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags)
struct io_mkdir *mkd = io_kiocb_to_cmd(req, struct io_mkdir);
int ret;
if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode);
@ -220,6 +220,7 @@ int io_symlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
}
req->flags |= REQ_F_NEED_CLEANUP;
req->flags |= REQ_F_FORCE_ASYNC;
return 0;
}
@ -228,8 +229,7 @@ int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags)
struct io_link *sl = io_kiocb_to_cmd(req, struct io_link);
int ret;
if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
ret = do_symlinkat(sl->oldpath, sl->new_dfd, sl->newpath);
@ -243,7 +243,7 @@ int io_linkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
struct io_link *lnk = io_kiocb_to_cmd(req, struct io_link);
const char __user *oldf, *newf;
if (sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)
if (sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
if (unlikely(req->flags & REQ_F_FIXED_FILE))
return -EBADF;
@ -265,6 +265,7 @@ int io_linkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
}
req->flags |= REQ_F_NEED_CLEANUP;
req->flags |= REQ_F_FORCE_ASYNC;
return 0;
}
@ -273,8 +274,7 @@ int io_linkat(struct io_kiocb *req, unsigned int issue_flags)
struct io_link *lnk = io_kiocb_to_cmd(req, struct io_link);
int ret;
if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
ret = do_linkat(lnk->old_dfd, lnk->oldpath, lnk->new_dfd,
lnk->newpath, lnk->flags);

File diff suppressed because it is too large Load Diff

View File

@ -50,8 +50,9 @@ void io_wq_put_and_exit(struct io_wq *wq);
void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
void io_wq_hash_work(struct io_wq_work *work, void *val);
int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask);
int io_wq_cpu_affinity(struct io_uring_task *tctx, cpumask_var_t mask);
int io_wq_max_workers(struct io_wq *wq, int *new_count);
bool io_wq_worker_stopped(void);
static inline bool io_wq_is_hashed(struct io_wq_work *work)
{

File diff suppressed because it is too large Load Diff

View File

@ -4,6 +4,7 @@
#include <linux/errno.h>
#include <linux/lockdep.h>
#include <linux/resume_user_mode.h>
#include <linux/kasan.h>
#include <linux/io_uring_types.h>
#include <uapi/linux/eventpoll.h>
#include "io-wq.h"
@ -14,6 +15,17 @@
#include <trace/events/io_uring.h>
#endif
enum {
/*
 * Flag for __io_req_task_work_add().
 *
 * A hint to not wake right away but delay until there are enough
 * tw's (task-work items) queued to match the number of CQEs the
 * task is waiting for.
 *
 * Must not be used with requests generating more than one CQE.
 * It's also ignored unless IORING_SETUP_DEFER_TASKRUN is set.
 */
IOU_F_TWQ_LAZY_WAKE = 1,
};
enum {
IOU_OK = 0,
IOU_ISSUE_SKIP_COMPLETE = -EIOCBQUEUED,
@ -26,16 +38,13 @@ enum {
IOU_STOP_MULTISHOT = -ECANCELED,
};
struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow);
bool io_req_cqe_overflow(struct io_kiocb *req);
bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow);
void io_req_cqe_overflow(struct io_kiocb *req);
int io_run_task_work_sig(struct io_ring_ctx *ctx);
int __io_run_local_work(struct io_ring_ctx *ctx, bool *locked);
int io_run_local_work(struct io_ring_ctx *ctx);
void io_req_defer_failed(struct io_kiocb *req, s32 res);
void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags);
bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags);
bool io_aux_cqe(struct io_ring_ctx *ctx, bool defer, u64 user_data, s32 res, u32 cflags,
bool allow_overflow);
bool io_fill_cqe_req_aux(struct io_kiocb *req, bool defer, s32 res, u32 cflags);
void __io_commit_cqring_flush(struct io_ring_ctx *ctx);
struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages);
@ -44,28 +53,26 @@ struct file *io_file_get_normal(struct io_kiocb *req, int fd);
struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
unsigned issue_flags);
static inline bool io_req_ffs_set(struct io_kiocb *req)
{
return req->flags & REQ_F_FIXED_FILE;
}
void __io_req_task_work_add(struct io_kiocb *req, bool allow_local);
void __io_req_task_work_add(struct io_kiocb *req, unsigned flags);
bool io_is_uring_fops(struct file *file);
bool io_alloc_async_data(struct io_kiocb *req);
void io_req_task_queue(struct io_kiocb *req);
void io_queue_iowq(struct io_kiocb *req, bool *dont_use);
void io_req_task_complete(struct io_kiocb *req, bool *locked);
void io_queue_iowq(struct io_kiocb *req, struct io_tw_state *ts_dont_use);
void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts);
void io_req_task_queue_fail(struct io_kiocb *req, int ret);
void io_req_task_submit(struct io_kiocb *req, bool *locked);
void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts);
void tctx_task_work(struct callback_head *cb);
__cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
int io_uring_alloc_task_context(struct task_struct *task,
struct io_ring_ctx *ctx);
int io_poll_issue(struct io_kiocb *req, bool *locked);
int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file,
int start, int end);
int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts);
int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr);
int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin);
void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node);
void __io_submit_flush_completions(struct io_ring_ctx *ctx);
int io_req_prep_async(struct io_kiocb *req);
struct io_wq_work *io_wq_free_work(struct io_wq_work *work);
@ -73,61 +80,73 @@ void io_wq_submit_work(struct io_wq_work *work);
void io_free_req(struct io_kiocb *req);
void io_queue_next(struct io_kiocb *req);
void __io_put_task(struct task_struct *task, int nr);
void io_task_refs_refill(struct io_uring_task *tctx);
bool __io_alloc_req_refill(struct io_ring_ctx *ctx);
bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
bool cancel_all);
#define io_lockdep_assert_cq_locked(ctx) \
do { \
if (ctx->flags & IORING_SETUP_IOPOLL) { \
lockdep_assert_held(&ctx->uring_lock); \
} else if (!ctx->task_complete) { \
lockdep_assert_held(&ctx->completion_lock); \
} else if (ctx->submitter_task->flags & PF_EXITING) { \
lockdep_assert(current_work()); \
} else { \
lockdep_assert(current == ctx->submitter_task); \
} \
} while (0)
/*
 * Lockdep assertions for code that is about to touch the CQ of @ctx.
 *
 * With CONFIG_PROVE_LOCKING the function asserts in_task() and then,
 * depending on how the ring was set up:
 *  - IORING_SETUP_IOPOLL: ->uring_lock must be held;
 *  - ->task_complete not set: ->completion_lock must be held;
 *  - otherwise, if ->submitter_task is set: current must be the
 *    submitter task, or, when that task has PF_EXITING set,
 *    current_work() must be non-NULL.
 * Without CONFIG_PROVE_LOCKING it is an empty stub.
 */
#if defined(CONFIG_PROVE_LOCKING)
static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
{
lockdep_assert(in_task());
if (ctx->flags & IORING_SETUP_IOPOLL) {
lockdep_assert_held(&ctx->uring_lock);
} else if (!ctx->task_complete) {
lockdep_assert_held(&ctx->completion_lock);
} else if (ctx->submitter_task) {
/*
* ->submitter_task may be NULL and we can still post a CQE,
* if the ring has been setup with IORING_SETUP_R_DISABLED.
* Not from an SQE, as those cannot be submitted, but via
* updating tagged resources.
*/
if (ctx->submitter_task->flags & PF_EXITING)
lockdep_assert(current_work());
else
lockdep_assert(current == ctx->submitter_task);
}
}
#else
/* !CONFIG_PROVE_LOCKING: no checks are performed. */
static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
{
}
#endif
static inline void io_req_task_work_add(struct io_kiocb *req)
{
__io_req_task_work_add(req, true);
__io_req_task_work_add(req, 0);
}
#define io_for_each_link(pos, head) \
for (pos = (head); pos; pos = pos->link)
void io_cq_unlock_post(struct io_ring_ctx *ctx);
static inline struct io_uring_cqe *io_get_cqe_overflow(struct io_ring_ctx *ctx,
bool overflow)
static inline bool io_get_cqe_overflow(struct io_ring_ctx *ctx,
struct io_uring_cqe **ret,
bool overflow)
{
io_lockdep_assert_cq_locked(ctx);
if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) {
struct io_uring_cqe *cqe = ctx->cqe_cached;
ctx->cached_cq_tail++;
ctx->cqe_cached++;
if (ctx->flags & IORING_SETUP_CQE32)
ctx->cqe_cached++;
return cqe;
if (unlikely(ctx->cqe_cached >= ctx->cqe_sentinel)) {
if (unlikely(!io_cqe_cache_refill(ctx, overflow)))
return false;
}
return __io_get_cqe(ctx, overflow);
*ret = ctx->cqe_cached;
ctx->cached_cq_tail++;
ctx->cqe_cached++;
if (ctx->flags & IORING_SETUP_CQE32)
ctx->cqe_cached++;
return true;
}
static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
static inline bool io_get_cqe(struct io_ring_ctx *ctx, struct io_uring_cqe **ret)
{
return io_get_cqe_overflow(ctx, false);
return io_get_cqe_overflow(ctx, ret, false);
}
static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx,
struct io_kiocb *req)
static __always_inline bool io_fill_cqe_req(struct io_ring_ctx *ctx,
struct io_kiocb *req)
{
struct io_uring_cqe *cqe;
@ -136,39 +155,22 @@ static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx,
* submission (by quite a lot). Increment the overflow count in
* the ring.
*/
cqe = io_get_cqe(ctx);
if (unlikely(!cqe))
if (unlikely(!io_get_cqe(ctx, &cqe)))
return false;
trace_io_uring_complete(req->ctx, req, req->cqe.user_data,
req->cqe.res, req->cqe.flags,
(req->flags & REQ_F_CQE32_INIT) ? req->extra1 : 0,
(req->flags & REQ_F_CQE32_INIT) ? req->extra2 : 0);
if (trace_io_uring_complete_enabled())
trace_io_uring_complete(req->ctx, req, req->cqe.user_data,
req->cqe.res, req->cqe.flags,
req->big_cqe.extra1, req->big_cqe.extra2);
memcpy(cqe, &req->cqe, sizeof(*cqe));
if (ctx->flags & IORING_SETUP_CQE32) {
u64 extra1 = 0, extra2 = 0;
if (req->flags & REQ_F_CQE32_INIT) {
extra1 = req->extra1;
extra2 = req->extra2;
}
WRITE_ONCE(cqe->big_cqe[0], extra1);
WRITE_ONCE(cqe->big_cqe[1], extra2);
memcpy(cqe->big_cqe, &req->big_cqe, sizeof(*cqe));
memset(&req->big_cqe, 0, sizeof(req->big_cqe));
}
return true;
}
static inline bool io_fill_cqe_req(struct io_ring_ctx *ctx,
struct io_kiocb *req)
{
if (likely(__io_fill_cqe_req(ctx, req)))
return true;
return io_req_cqe_overflow(req);
}
static inline void req_set_fail(struct io_kiocb *req)
{
req->flags |= REQ_F_FAIL;
@ -189,10 +191,10 @@ static inline bool req_has_async_data(struct io_kiocb *req)
return req->flags & REQ_F_ASYNC_DATA;
}
static inline void io_put_file(struct file *file)
static inline void io_put_file(struct io_kiocb *req)
{
if (file)
fput(file);
if (!(req->flags & REQ_F_FIXED_FILE) && req->file)
fput(req->file);
}
static inline void io_ring_submit_unlock(struct io_ring_ctx *ctx,
@ -223,8 +225,14 @@ static inline void io_commit_cqring(struct io_ring_ctx *ctx)
smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail);
}
/* requires smb_mb() prior, see wq_has_sleeper() */
static inline void __io_cqring_wake(struct io_ring_ctx *ctx)
/*
 * Wake the waiters on ctx->poll_wq, passing EPOLL_URING_WAKE | EPOLLIN
 * as the wake-up key. Does nothing when wq_has_sleeper() reports that
 * nobody is waiting.
 */
static inline void io_poll_wq_wake(struct io_ring_ctx *ctx)
{
if (wq_has_sleeper(&ctx->poll_wq))
__wake_up(&ctx->poll_wq, TASK_NORMAL, 0,
poll_to_key(EPOLL_URING_WAKE | EPOLLIN));
}
static inline void io_cqring_wake(struct io_ring_ctx *ctx)
{
/*
* Trigger waitqueue handler on all waiters on our waitqueue. This
@ -236,17 +244,11 @@ static inline void __io_cqring_wake(struct io_ring_ctx *ctx)
* waitqueue handlers, we know we have a dependency between eventfd or
* epoll and should terminate multishot poll at that point.
*/
if (waitqueue_active(&ctx->cq_wait))
if (wq_has_sleeper(&ctx->cq_wait))
__wake_up(&ctx->cq_wait, TASK_NORMAL, 0,
poll_to_key(EPOLL_URING_WAKE | EPOLLIN));
}
static inline void io_cqring_wake(struct io_ring_ctx *ctx)
{
smp_mb();
__io_cqring_wake(ctx);
}
static inline bool io_sqring_full(struct io_ring_ctx *ctx)
{
struct io_rings *r = ctx->rings;
@ -257,9 +259,11 @@ static inline bool io_sqring_full(struct io_ring_ctx *ctx)
static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
{
struct io_rings *rings = ctx->rings;
unsigned int entries;
/* make sure SQ entry isn't read before tail */
return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
entries = smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
return min(entries, ctx->sq_entries);
}
static inline int io_run_task_work(void)
@ -294,47 +298,11 @@ static inline bool io_task_work_pending(struct io_ring_ctx *ctx)
return task_work_pending(current) || !wq_list_empty(&ctx->work_llist);
}
static inline int io_run_task_work_ctx(struct io_ring_ctx *ctx)
static inline void io_tw_lock(struct io_ring_ctx *ctx, struct io_tw_state *ts)
{
int ret = 0;
int ret2;
if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
ret = io_run_local_work(ctx);
/* want to run this after in case more is added */
ret2 = io_run_task_work();
/* Try propagate error in favour of if tasks were run,
* but still make sure to run them if requested
*/
if (ret >= 0)
ret += ret2;
return ret;
}
static inline int io_run_local_work_locked(struct io_ring_ctx *ctx)
{
bool locked;
int ret;
if (llist_empty(&ctx->work_llist))
return 0;
locked = true;
ret = __io_run_local_work(ctx, &locked);
/* shouldn't happen! */
if (WARN_ON_ONCE(!locked))
if (!ts->locked) {
mutex_lock(&ctx->uring_lock);
return ret;
}
static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked)
{
if (!*locked) {
mutex_lock(&ctx->uring_lock);
*locked = true;
ts->locked = true;
}
}
@ -355,19 +323,11 @@ static inline void io_req_complete_defer(struct io_kiocb *req)
static inline void io_commit_cqring_flush(struct io_ring_ctx *ctx)
{
if (unlikely(ctx->off_timeout_used || ctx->drain_active || ctx->has_evfd))
if (unlikely(ctx->off_timeout_used || ctx->drain_active ||
ctx->has_evfd || ctx->poll_activated))
__io_commit_cqring_flush(ctx);
}
/* must to be called somewhat shortly after putting a request */
static inline void io_put_task(struct task_struct *task, int nr)
{
if (likely(task == current))
task->io_uring->cached_refs += nr;
else
__io_put_task(task, nr);
}
static inline void io_get_task_refs(int nr)
{
struct io_uring_task *tctx = current->io_uring;
@ -382,19 +342,30 @@ static inline bool io_req_cache_empty(struct io_ring_ctx *ctx)
return !ctx->submit_state.free_list.next;
}
static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx)
extern struct kmem_cache *req_cachep;
static inline struct io_kiocb *io_extract_req(struct io_ring_ctx *ctx)
{
if (unlikely(io_req_cache_empty(ctx)))
return __io_alloc_req_refill(ctx);
struct io_kiocb *req;
req = container_of(ctx->submit_state.free_list.next, struct io_kiocb, comp_list);
wq_stack_extract(&ctx->submit_state.free_list);
return req;
}
static inline bool io_alloc_req(struct io_ring_ctx *ctx, struct io_kiocb **req)
{
if (unlikely(io_req_cache_empty(ctx))) {
if (!__io_alloc_req_refill(ctx))
return false;
}
*req = io_extract_req(ctx);
return true;
}
static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
static inline bool io_allowed_defer_tw_run(struct io_ring_ctx *ctx)
{
struct io_wq_work_node *node;
node = wq_stack_extract(&ctx->submit_state.free_list);
return container_of(node, struct io_kiocb, comp_list);
return likely(ctx->submitter_task == current);
}
static inline bool io_allowed_run_tw(struct io_ring_ctx *ctx)
@ -410,4 +381,14 @@ static inline void io_req_queue_tw_complete(struct io_kiocb *req, s32 res)
io_req_task_work_add(req);
}
/*
 * Size in bytes of a single SQ slot for @ctx. Rings created with
 * IORING_SETUP_SQE128 use slots twice as large as a plain
 * struct io_uring_sqe.
 */
static inline size_t uring_sqe_size(struct io_ring_ctx *ctx)
{
	size_t slot_size = sizeof(struct io_uring_sqe);

	if (ctx->flags & IORING_SETUP_SQE128)
		slot_size *= 2;
	return slot_size;
}
#endif

View File

@ -137,7 +137,8 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
return NULL;
head &= bl->mask;
if (head < IO_BUFFER_LIST_BUF_PER_PAGE) {
/* mmaped buffers are always contig */
if (bl->is_mmap || head < IO_BUFFER_LIST_BUF_PER_PAGE) {
buf = &br->bufs[head];
} else {
int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1);
@ -179,7 +180,7 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
bl = io_buffer_get_list(ctx, req->buf_index);
if (likely(bl)) {
if (bl->buf_nr_pages)
if (bl->is_mapped)
ret = io_ring_buffer_select(req, len, bl, issue_flags);
else
ret = io_provided_buffer_select(req, len, bl);
@ -214,17 +215,24 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
if (!nbufs)
return 0;
if (bl->buf_nr_pages) {
int j;
if (bl->is_mapped) {
i = bl->buf_ring->tail - bl->head;
for (j = 0; j < bl->buf_nr_pages; j++)
unpin_user_page(bl->buf_pages[j]);
kvfree(bl->buf_pages);
bl->buf_pages = NULL;
bl->buf_nr_pages = 0;
if (bl->is_mmap) {
folio_put(virt_to_folio(bl->buf_ring));
bl->buf_ring = NULL;
bl->is_mmap = 0;
} else if (bl->buf_nr_pages) {
int j;
for (j = 0; j < bl->buf_nr_pages; j++)
unpin_user_page(bl->buf_pages[j]);
kvfree(bl->buf_pages);
bl->buf_pages = NULL;
bl->buf_nr_pages = 0;
}
/* make sure it's seen as empty */
INIT_LIST_HEAD(&bl->buf_list);
bl->is_mapped = 0;
return i;
}
@ -304,7 +312,7 @@ int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
if (bl) {
ret = -EINVAL;
/* can't use provide/remove buffers command on mapped buffers */
if (!bl->buf_nr_pages)
if (!bl->is_mapped)
ret = __io_remove_buffers(ctx, bl, p->nbufs);
}
io_ring_submit_unlock(ctx, issue_flags);
@ -449,7 +457,7 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
}
}
/* can't add buffers via this command for a mapped buffer ring */
if (bl->buf_nr_pages) {
if (bl->is_mapped) {
ret = -EINVAL;
goto err;
}
@ -464,23 +472,98 @@ err:
return IOU_OK;
}
int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
/*
 * Back the buffer ring of @bl with user memory described by @reg.
 *
 * Pins the pages starting at reg->ring_addr that cover ring_entries
 * elements of the ring's bufs[] array and records them in @bl, marking
 * the list as mapped but not mmap'ed (is_mapped = 1, is_mmap = 0).
 *
 * Returns 0 on success, the error encoded by io_pin_pages() if pinning
 * fails, or -EINVAL (after unpinning and freeing the page array) if any
 * pinned page is highmem or, where SHM_COLOUR is defined, the user and
 * kernel addresses are not SHM_COLOUR aligned.
 */
static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
struct io_buffer_list *bl)
{
struct io_uring_buf_ring *br;
struct page **pages;
int i, nr_pages;
/* br is only used for its type here; it is assigned further down */
pages = io_pin_pages(reg->ring_addr,
flex_array_size(br, bufs, reg->ring_entries),
&nr_pages);
if (IS_ERR(pages))
return PTR_ERR(pages);
/*
* Apparently some 32-bit boxes (ARM) will return highmem pages,
* which then need to be mapped. We could support that, but it'd
* complicate the code and slowdown the common cases quite a bit.
* So just error out, returning -EINVAL just like we did on kernels
* that didn't support mapped buffer rings.
*/
for (i = 0; i < nr_pages; i++)
if (PageHighMem(pages[i]))
goto error_unpin;
/* the ring is addressed through the kernel address of the first page */
br = page_address(pages[0]);
#ifdef SHM_COLOUR
/*
* On platforms that have specific aliasing requirements, SHM_COLOUR
* is set and we must guarantee that the kernel and user side align
* nicely. We cannot do that if IOU_PBUF_RING_MMAP isn't set and
* the application mmap's the provided ring buffer. Fail the request
* if we, by chance, don't end up with aligned addresses. The app
* should use IOU_PBUF_RING_MMAP instead, and liburing will handle
* this transparently.
*/
if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1))
goto error_unpin;
#endif
bl->buf_pages = pages;
bl->buf_nr_pages = nr_pages;
bl->buf_ring = br;
bl->is_mapped = 1;
bl->is_mmap = 0;
return 0;
error_unpin:
/* drop every pin taken above and free the page array */
for (i = 0; i < nr_pages; i++)
unpin_user_page(pages[i]);
kvfree(pages);
return -EINVAL;
}
/*
 * Back the buffer ring of @bl with kernel-allocated memory.
 *
 * Allocates zeroed, accounted pages (as a compound allocation, without
 * allocation-failure warnings) large enough for reg->ring_entries
 * entries and stores the address in bl->buf_ring, marking the list as
 * both mapped and mmap'ed (is_mapped = 1, is_mmap = 1).
 *
 * Returns 0 on success or -ENOMEM if the page allocation fails.
 */
static int io_alloc_pbuf_ring(struct io_uring_buf_reg *reg,
struct io_buffer_list *bl)
{
gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
size_t ring_size;
void *ptr;
/*
 * NOTE(review): the size is computed per sizeof(struct io_uring_buf_ring),
 * while io_pin_pbuf_ring() sizes the same ring with
 * flex_array_size(br, bufs, ...) - confirm the two always agree.
 */
ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring);
ptr = (void *) __get_free_pages(gfp, get_order(ring_size));
if (!ptr)
return -ENOMEM;
bl->buf_ring = ptr;
bl->is_mapped = 1;
bl->is_mmap = 1;
return 0;
}
int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
struct io_uring_buf_reg reg;
struct io_buffer_list *bl, *free_bl = NULL;
struct page **pages;
int nr_pages;
int ret;
if (copy_from_user(&reg, arg, sizeof(reg)))
return -EFAULT;
if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2])
if (reg.resv[0] || reg.resv[1] || reg.resv[2])
return -EINVAL;
if (!reg.ring_addr)
return -EFAULT;
if (reg.ring_addr & ~PAGE_MASK)
if (reg.flags & ~IOU_PBUF_RING_MMAP)
return -EINVAL;
if (!(reg.flags & IOU_PBUF_RING_MMAP)) {
if (!reg.ring_addr)
return -EFAULT;
if (reg.ring_addr & ~PAGE_MASK)
return -EINVAL;
} else {
if (reg.ring_addr)
return -EINVAL;
}
if (!is_power_of_2(reg.ring_entries))
return -EINVAL;
@ -497,7 +580,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
bl = io_buffer_get_list(ctx, reg.bgid);
if (bl) {
/* if mapped buffer ring OR classic exists, don't allow */
if (bl->buf_nr_pages || !list_empty(&bl->buf_list))
if (bl->is_mapped || !list_empty(&bl->buf_list))
return -EEXIST;
} else {
free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL);
@ -505,22 +588,21 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
return -ENOMEM;
}
pages = io_pin_pages(reg.ring_addr,
flex_array_size(br, bufs, reg.ring_entries),
&nr_pages);
if (IS_ERR(pages)) {
kfree(free_bl);
return PTR_ERR(pages);
if (!(reg.flags & IOU_PBUF_RING_MMAP))
ret = io_pin_pbuf_ring(&reg, bl);
else
ret = io_alloc_pbuf_ring(&reg, bl);
if (!ret) {
bl->nr_entries = reg.ring_entries;
bl->mask = reg.ring_entries - 1;
io_buffer_add_list(ctx, bl, reg.bgid);
return 0;
}
br = page_address(pages[0]);
bl->buf_pages = pages;
bl->buf_nr_pages = nr_pages;
bl->nr_entries = reg.ring_entries;
bl->buf_ring = br;
bl->mask = reg.ring_entries - 1;
io_buffer_add_list(ctx, bl, reg.bgid);
return 0;
kfree(free_bl);
return ret;
}
int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
@ -530,13 +612,15 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
if (copy_from_user(&reg, arg, sizeof(reg)))
return -EFAULT;
if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2])
if (reg.resv[0] || reg.resv[1] || reg.resv[2])
return -EINVAL;
if (reg.flags)
return -EINVAL;
bl = io_buffer_get_list(ctx, reg.bgid);
if (!bl)
return -ENOENT;
if (!bl->buf_nr_pages)
if (!bl->is_mapped)
return -EINVAL;
__io_remove_buffers(ctx, bl, -1U);
@ -546,3 +630,14 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
}
return 0;
}
/*
 * Look up buffer group @bgid in @ctx and return the address of its
 * buffer ring, but only when that ring is flagged is_mmap. Returns
 * NULL if the group does not exist or is not flagged is_mmap.
 */
void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid)
{
	struct io_buffer_list *list = io_buffer_get_list(ctx, bgid);

	if (list && list->is_mmap)
		return list->buf_ring;
	return NULL;
}

View File

@ -23,6 +23,11 @@ struct io_buffer_list {
__u16 nr_entries;
__u16 head;
__u16 mask;
/* ring mapped provided buffers */
__u8 is_mapped;
/* ring mapped provided buffers, but mmap'ed by application */
__u8 is_mmap;
};
struct io_buffer {
@ -50,6 +55,8 @@ unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags);
void io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags);
void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid);
static inline void io_kbuf_recycle_ring(struct io_kiocb *req)
{
/*

View File

@ -13,6 +13,11 @@
#include "filetable.h"
#include "msg_ring.h"
/*
 * All flag bits accepted for MSG_RING; io_msg_ring_prep() fails with
 * -EINVAL if sqe->msg_ring_flags has any bit outside this mask.
 */
#define IORING_MSG_RING_MASK (IORING_MSG_RING_CQE_SKIP | \
IORING_MSG_RING_FLAGS_PASS)
struct io_msg {
struct file *file;
struct file *src_file;
@ -21,7 +26,10 @@ struct io_msg {
u32 len;
u32 cmd;
u32 src_fd;
u32 dst_fd;
union {
u32 dst_fd;
u32 cqe_flags;
};
u32 flags;
};
@ -91,6 +99,11 @@ static void io_msg_tw_complete(struct callback_head *head)
if (current->flags & PF_EXITING) {
ret = -EOWNERDEAD;
} else {
u32 flags = 0;
if (msg->flags & IORING_MSG_RING_FLAGS_PASS)
flags = msg->cqe_flags;
/*
* If the target ring is using IOPOLL mode, then we need to be
* holding the uring_lock for posting completions. Other ring
@ -99,7 +112,7 @@ static void io_msg_tw_complete(struct callback_head *head)
*/
if (target_ctx->flags & IORING_SETUP_IOPOLL)
mutex_lock(&target_ctx->uring_lock);
if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0))
if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags))
ret = -EOVERFLOW;
if (target_ctx->flags & IORING_SETUP_IOPOLL)
mutex_unlock(&target_ctx->uring_lock);
@ -114,9 +127,12 @@ static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_ring_ctx *target_ctx = req->file->private_data;
struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
u32 flags = 0;
int ret;
if (msg->src_fd || msg->dst_fd || msg->flags)
if (msg->src_fd || msg->flags & ~IORING_MSG_RING_FLAGS_PASS)
return -EINVAL;
if (!(msg->flags & IORING_MSG_RING_FLAGS_PASS) && msg->dst_fd)
return -EINVAL;
if (target_ctx->flags & IORING_SETUP_R_DISABLED)
return -EBADFD;
@ -124,15 +140,18 @@ static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags)
if (io_msg_need_remote(target_ctx))
return io_msg_exec_remote(req, io_msg_tw_complete);
if (msg->flags & IORING_MSG_RING_FLAGS_PASS)
flags = msg->cqe_flags;
ret = -EOVERFLOW;
if (target_ctx->flags & IORING_SETUP_IOPOLL) {
if (unlikely(io_double_lock_ctx(target_ctx, issue_flags)))
return -EAGAIN;
if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0))
if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags))
ret = 0;
io_double_unlock_ctx(target_ctx);
} else {
if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0))
if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags))
ret = 0;
}
return ret;
@ -143,14 +162,12 @@ static struct file *io_msg_grab_file(struct io_kiocb *req, unsigned int issue_fl
struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
struct io_ring_ctx *ctx = req->ctx;
struct file *file = NULL;
unsigned long file_ptr;
int idx = msg->src_fd;
io_ring_submit_lock(ctx, issue_flags);
if (likely(idx < ctx->nr_user_files)) {
idx = array_index_nospec(idx, ctx->nr_user_files);
file_ptr = io_fixed_file_slot(&ctx->file_table, idx)->file_ptr;
file = (struct file *) (file_ptr & FFS_MASK);
file = io_file_from_index(&ctx->file_table, idx);
if (file)
get_file(file);
}
@ -243,7 +260,7 @@ int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
msg->src_fd = READ_ONCE(sqe->addr3);
msg->dst_fd = READ_ONCE(sqe->file_index);
msg->flags = READ_ONCE(sqe->msg_ring_flags);
if (msg->flags & ~IORING_MSG_RING_CQE_SKIP)
if (msg->flags & ~IORING_MSG_RING_MASK)
return -EINVAL;
return 0;

View File

@ -92,6 +92,7 @@ int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
return -EINVAL;
shutdown->how = READ_ONCE(sqe->len);
req->flags |= REQ_F_FORCE_ASYNC;
return 0;
}
@ -101,8 +102,7 @@ int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
struct socket *sock;
int ret;
if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
sock = sock_from_file(req->file);
if (unlikely(!sock))
@ -183,10 +183,14 @@ static int io_setup_async_msg(struct io_kiocb *req,
memcpy(async_msg, kmsg, sizeof(*kmsg));
if (async_msg->msg.msg_name)
async_msg->msg.msg_name = &async_msg->addr;
if ((req->flags & REQ_F_BUFFER_SELECT) && !async_msg->msg.msg_iter.nr_segs)
return -EAGAIN;
/* if we're using fast_iov, set it to the new one */
if (!kmsg->free_iov) {
size_t fast_idx = kmsg->msg.msg_iter.iov - kmsg->fast_iov;
async_msg->msg.msg_iter.iov = &async_msg->fast_iov[fast_idx];
if (iter_is_iovec(&kmsg->msg.msg_iter) && !kmsg->free_iov) {
size_t fast_idx = iter_iov(&kmsg->msg.msg_iter) - kmsg->fast_iov;
async_msg->msg.msg_iter.__iov = &async_msg->fast_iov[fast_idx];
}
return -EAGAIN;
@ -354,7 +358,6 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags)
struct sockaddr_storage __address;
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
struct msghdr msg;
struct iovec iov;
struct socket *sock;
unsigned flags;
int min_ret = 0;
@ -388,7 +391,7 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags)
if (unlikely(!sock))
return -ENOTSOCK;
ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter);
ret = import_ubuf(WRITE, sr->buf, sr->len, &msg.msg_iter);
if (unlikely(ret))
return ret;
@ -398,6 +401,7 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags)
if (flags & MSG_WAITALL)
min_ret = iov_iter_count(&msg.msg_iter);
flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
msg.msg_flags = flags;
ret = sock_sendmsg(sock, &msg);
if (ret < min_ret) {
@ -542,6 +546,7 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req,
struct io_async_msghdr *iomsg)
{
iomsg->msg.msg_name = &iomsg->addr;
iomsg->msg.msg_iter.nr_segs = 0;
#ifdef CONFIG_COMPAT
if (req->ctx->compat)
@ -625,9 +630,15 @@ static inline void io_recv_prep_retry(struct io_kiocb *req)
* again (for multishot).
*/
static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
unsigned int cflags, bool mshot_finished,
struct msghdr *msg, bool mshot_finished,
unsigned issue_flags)
{
unsigned int cflags;
cflags = io_put_kbuf(req, issue_flags);
if (msg->msg_inq && msg->msg_inq != -1)
cflags |= IORING_CQE_F_SOCK_NONEMPTY;
if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
io_req_set_res(req, *ret, cflags);
*ret = IOU_OK;
@ -635,10 +646,18 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
}
if (!mshot_finished) {
if (io_aux_cqe(req->ctx, issue_flags & IO_URING_F_COMPLETE_DEFER,
req->cqe.user_data, *ret, cflags | IORING_CQE_F_MORE, true)) {
if (io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER,
*ret, cflags | IORING_CQE_F_MORE)) {
io_recv_prep_retry(req);
return false;
/* Known not-empty or unknown state, retry */
if (cflags & IORING_CQE_F_SOCK_NONEMPTY ||
msg->msg_inq == -1)
return false;
if (issue_flags & IO_URING_F_MULTISHOT)
*ret = IOU_ISSUE_SKIP_COMPLETE;
else
*ret = -EAGAIN;
return true;
}
/* Otherwise stop multishot but use the current result. */
}
@ -741,7 +760,6 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
struct io_async_msghdr iomsg, *kmsg;
struct socket *sock;
unsigned int cflags;
unsigned flags;
int ret, min_ret = 0;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
@ -784,25 +802,26 @@ retry_multishot:
}
}
kmsg->fast_iov[0].iov_base = buf;
kmsg->fast_iov[0].iov_len = len;
iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, 1,
len);
iov_iter_ubuf(&kmsg->msg.msg_iter, READ, buf, len);
}
flags = sr->msg_flags;
if (force_nonblock)
flags |= MSG_DONTWAIT;
if (flags & MSG_WAITALL)
min_ret = iov_iter_count(&kmsg->msg.msg_iter);
kmsg->msg.msg_get_inq = 1;
if (req->flags & REQ_F_APOLL_MULTISHOT)
kmsg->msg.msg_inq = -1;
if (req->flags & REQ_F_APOLL_MULTISHOT) {
ret = io_recvmsg_multishot(sock, sr, kmsg, flags,
&mshot_finished);
else
} else {
/* disable partial retry for recvmsg with cmsg attached */
if (flags & MSG_WAITALL && !kmsg->msg.msg_controllen)
min_ret = iov_iter_count(&kmsg->msg.msg_iter);
ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg,
kmsg->uaddr, flags);
}
if (ret < min_ret) {
if (ret == -EAGAIN && force_nonblock) {
@ -832,11 +851,7 @@ retry_multishot:
else
io_kbuf_recycle(req, issue_flags);
cflags = io_put_kbuf(req, issue_flags);
if (kmsg->msg.msg_inq)
cflags |= IORING_CQE_F_SOCK_NONEMPTY;
if (!io_recv_finish(req, &ret, cflags, mshot_finished, issue_flags))
if (!io_recv_finish(req, &ret, &kmsg->msg, mshot_finished, issue_flags))
goto retry_multishot;
if (mshot_finished) {
@ -855,8 +870,6 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags)
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
struct msghdr msg;
struct socket *sock;
struct iovec iov;
unsigned int cflags;
unsigned flags;
int ret, min_ret = 0;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
@ -873,6 +886,14 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags)
if (unlikely(!sock))
return -ENOTSOCK;
msg.msg_name = NULL;
msg.msg_namelen = 0;
msg.msg_control = NULL;
msg.msg_get_inq = 1;
msg.msg_controllen = 0;
msg.msg_iocb = NULL;
msg.msg_ubuf = NULL;
retry_multishot:
if (io_do_buffer_select(req)) {
void __user *buf;
@ -883,18 +904,12 @@ retry_multishot:
sr->buf = buf;
}
ret = import_single_range(READ, sr->buf, len, &iov, &msg.msg_iter);
ret = import_ubuf(READ, sr->buf, len, &msg.msg_iter);
if (unlikely(ret))
goto out_free;
msg.msg_name = NULL;
msg.msg_namelen = 0;
msg.msg_control = NULL;
msg.msg_get_inq = 1;
msg.msg_inq = -1;
msg.msg_flags = 0;
msg.msg_controllen = 0;
msg.msg_iocb = NULL;
msg.msg_ubuf = NULL;
flags = sr->msg_flags;
if (force_nonblock)
@ -934,11 +949,7 @@ out_free:
else
io_kbuf_recycle(req, issue_flags);
cflags = io_put_kbuf(req, issue_flags);
if (msg.msg_inq)
cflags |= IORING_CQE_F_SOCK_NONEMPTY;
if (!io_recv_finish(req, &ret, cflags, ret <= 0, issue_flags))
if (!io_recv_finish(req, &ret, &msg, ret <= 0, issue_flags))
goto retry_multishot;
return ret;
@ -1094,7 +1105,6 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
struct sockaddr_storage __address;
struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg);
struct msghdr msg;
struct iovec iov;
struct socket *sock;
unsigned msg_flags;
int ret, min_ret = 0;
@ -1136,8 +1146,7 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
msg.sg_from_iter = io_sg_from_iter;
} else {
io_notif_set_extended(zc->notif);
ret = import_single_range(WRITE, zc->buf, zc->len, &iov,
&msg.msg_iter);
ret = import_ubuf(WRITE, zc->buf, zc->len, &msg.msg_iter);
if (unlikely(ret))
return ret;
ret = io_notif_account_mem(zc->notif, zc->len);
@ -1151,6 +1160,7 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
msg_flags |= MSG_DONTWAIT;
if (msg_flags & MSG_WAITALL)
min_ret = iov_iter_count(&msg.msg_iter);
msg_flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
msg.msg_flags = msg_flags;
msg.msg_ubuf = &io_notif_to_data(zc->notif)->uarg;
@ -1312,7 +1322,6 @@ int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
int io_accept(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept);
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
@ -1362,8 +1371,8 @@ retry:
if (ret < 0)
return ret;
if (io_aux_cqe(ctx, issue_flags & IO_URING_F_COMPLETE_DEFER,
req->cqe.user_data, ret, IORING_CQE_F_MORE, true))
if (io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER,
ret, IORING_CQE_F_MORE))
goto retry;
return -ECANCELED;

View File

@ -5,8 +5,8 @@
#include "alloc_cache.h"
#if defined(CONFIG_NET)
struct io_async_msghdr {
#if defined(CONFIG_NET)
union {
struct iovec fast_iov[UIO_FASTIOV];
struct {
@ -22,8 +22,11 @@ struct io_async_msghdr {
struct sockaddr __user *uaddr;
struct msghdr msg;
struct sockaddr_storage addr;
#endif
};
#if defined(CONFIG_NET)
struct io_async_connect {
struct sockaddr_storage address;
};

View File

@ -9,7 +9,7 @@
#include "notif.h"
#include "rsrc.h"
static void io_notif_complete_tw_ext(struct io_kiocb *notif, bool *locked)
static void io_notif_complete_tw_ext(struct io_kiocb *notif, struct io_tw_state *ts)
{
struct io_notif_data *nd = io_notif_to_data(notif);
struct io_ring_ctx *ctx = notif->ctx;
@ -21,7 +21,7 @@ static void io_notif_complete_tw_ext(struct io_kiocb *notif, bool *locked)
__io_unaccount_mem(ctx->user, nd->account_pages);
nd->account_pages = 0;
}
io_req_task_complete(notif, locked);
io_req_task_complete(notif, ts);
}
static void io_tx_ubuf_callback(struct sk_buff *skb, struct ubuf_info *uarg,
@ -31,7 +31,7 @@ static void io_tx_ubuf_callback(struct sk_buff *skb, struct ubuf_info *uarg,
struct io_kiocb *notif = cmd_to_io_kiocb(nd);
if (refcount_dec_and_test(&uarg->refcnt))
io_req_task_work_add(notif);
__io_req_task_work_add(notif, IOU_F_TWQ_LAZY_WAKE);
}
static void io_tx_ubuf_callback_ext(struct sk_buff *skb, struct ubuf_info *uarg,
@ -68,9 +68,8 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx)
struct io_kiocb *notif;
struct io_notif_data *nd;
if (unlikely(!io_alloc_req_refill(ctx)))
if (unlikely(!io_alloc_req(ctx, &notif)))
return NULL;
notif = io_alloc_req(ctx);
notif->opcode = IORING_OP_NOP;
notif->flags = 0;
notif->file = NULL;
@ -80,7 +79,7 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx)
notif->io_task_work.func = io_req_task_complete;
nd = io_notif_to_data(notif);
nd->uarg.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN;
nd->uarg.flags = IO_NOTIF_UBUF_FLAGS;
nd->uarg.callback = io_tx_ubuf_callback;
refcount_set(&nd->uarg.refcnt, 1);
return notif;

View File

@ -7,6 +7,7 @@
#include "rsrc.h"
#define IO_NOTIF_UBUF_FLAGS (SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN)
#define IO_NOTIF_SPLICE_BATCH 32
struct io_notif_data {
@ -33,7 +34,7 @@ static inline void io_notif_flush(struct io_kiocb *notif)
/* drop slot's master ref */
if (refcount_dec_and_test(&nd->uarg.refcnt))
io_req_task_work_add(notif);
__io_req_task_work_add(notif, IOU_F_TWQ_LAZY_WAKE);
}
static inline int io_notif_account_mem(struct io_kiocb *notif, unsigned len)

View File

@ -46,11 +46,10 @@ static __maybe_unused int io_eopnotsupp_prep(struct io_kiocb *kiocb,
return -EOPNOTSUPP;
}
const struct io_op_def io_op_defs[] = {
const struct io_issue_def io_issue_defs[] = {
[IORING_OP_NOP] = {
.audit_skip = 1,
.iopoll = 1,
.name = "NOP",
.prep = io_nop_prep,
.issue = io_nop,
},
@ -64,13 +63,8 @@ const struct io_op_def io_op_defs[] = {
.ioprio = 1,
.iopoll = 1,
.iopoll_queue = 1,
.async_size = sizeof(struct io_async_rw),
.name = "READV",
.prep = io_prep_rw,
.issue = io_read,
.prep_async = io_readv_prep_async,
.cleanup = io_readv_writev_cleanup,
.fail = io_rw_fail,
},
[IORING_OP_WRITEV] = {
.needs_file = 1,
@ -82,18 +76,12 @@ const struct io_op_def io_op_defs[] = {
.ioprio = 1,
.iopoll = 1,
.iopoll_queue = 1,
.async_size = sizeof(struct io_async_rw),
.name = "WRITEV",
.prep = io_prep_rw,
.issue = io_write,
.prep_async = io_writev_prep_async,
.cleanup = io_readv_writev_cleanup,
.fail = io_rw_fail,
},
[IORING_OP_FSYNC] = {
.needs_file = 1,
.audit_skip = 1,
.name = "FSYNC",
.prep = io_fsync_prep,
.issue = io_fsync,
},
@ -106,11 +94,8 @@ const struct io_op_def io_op_defs[] = {
.ioprio = 1,
.iopoll = 1,
.iopoll_queue = 1,
.async_size = sizeof(struct io_async_rw),
.name = "READ_FIXED",
.prep = io_prep_rw,
.issue = io_read,
.fail = io_rw_fail,
},
[IORING_OP_WRITE_FIXED] = {
.needs_file = 1,
@ -122,30 +107,24 @@ const struct io_op_def io_op_defs[] = {
.ioprio = 1,
.iopoll = 1,
.iopoll_queue = 1,
.async_size = sizeof(struct io_async_rw),
.name = "WRITE_FIXED",
.prep = io_prep_rw,
.issue = io_write,
.fail = io_rw_fail,
},
[IORING_OP_POLL_ADD] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.audit_skip = 1,
.name = "POLL_ADD",
.prep = io_poll_add_prep,
.issue = io_poll_add,
},
[IORING_OP_POLL_REMOVE] = {
.audit_skip = 1,
.name = "POLL_REMOVE",
.prep = io_poll_remove_prep,
.issue = io_poll_remove,
},
[IORING_OP_SYNC_FILE_RANGE] = {
.needs_file = 1,
.audit_skip = 1,
.name = "SYNC_FILE_RANGE",
.prep = io_sfr_prep,
.issue = io_sync_file_range,
},
@ -155,14 +134,9 @@ const struct io_op_def io_op_defs[] = {
.pollout = 1,
.ioprio = 1,
.manual_alloc = 1,
.name = "SENDMSG",
#if defined(CONFIG_NET)
.async_size = sizeof(struct io_async_msghdr),
.prep = io_sendmsg_prep,
.issue = io_sendmsg,
.prep_async = io_sendmsg_prep_async,
.cleanup = io_sendmsg_recvmsg_cleanup,
.fail = io_sendrecv_fail,
#else
.prep = io_eopnotsupp_prep,
#endif
@ -174,29 +148,21 @@ const struct io_op_def io_op_defs[] = {
.buffer_select = 1,
.ioprio = 1,
.manual_alloc = 1,
.name = "RECVMSG",
#if defined(CONFIG_NET)
.async_size = sizeof(struct io_async_msghdr),
.prep = io_recvmsg_prep,
.issue = io_recvmsg,
.prep_async = io_recvmsg_prep_async,
.cleanup = io_sendmsg_recvmsg_cleanup,
.fail = io_sendrecv_fail,
#else
.prep = io_eopnotsupp_prep,
#endif
},
[IORING_OP_TIMEOUT] = {
.audit_skip = 1,
.async_size = sizeof(struct io_timeout_data),
.name = "TIMEOUT",
.prep = io_timeout_prep,
.issue = io_timeout,
},
[IORING_OP_TIMEOUT_REMOVE] = {
/* used by timeout updates' prep() */
.audit_skip = 1,
.name = "TIMEOUT_REMOVE",
.prep = io_timeout_remove_prep,
.issue = io_timeout_remove,
},
@ -206,7 +172,6 @@ const struct io_op_def io_op_defs[] = {
.pollin = 1,
.poll_exclusive = 1,
.ioprio = 1, /* used for flags */
.name = "ACCEPT",
#if defined(CONFIG_NET)
.prep = io_accept_prep,
.issue = io_accept,
@ -216,14 +181,11 @@ const struct io_op_def io_op_defs[] = {
},
[IORING_OP_ASYNC_CANCEL] = {
.audit_skip = 1,
.name = "ASYNC_CANCEL",
.prep = io_async_cancel_prep,
.issue = io_async_cancel,
},
[IORING_OP_LINK_TIMEOUT] = {
.audit_skip = 1,
.async_size = sizeof(struct io_timeout_data),
.name = "LINK_TIMEOUT",
.prep = io_link_timeout_prep,
.issue = io_no_issue,
},
@ -231,46 +193,36 @@ const struct io_op_def io_op_defs[] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
.name = "CONNECT",
#if defined(CONFIG_NET)
.async_size = sizeof(struct io_async_connect),
.prep = io_connect_prep,
.issue = io_connect,
.prep_async = io_connect_prep_async,
#else
.prep = io_eopnotsupp_prep,
#endif
},
[IORING_OP_FALLOCATE] = {
.needs_file = 1,
.name = "FALLOCATE",
.prep = io_fallocate_prep,
.issue = io_fallocate,
},
[IORING_OP_OPENAT] = {
.name = "OPENAT",
.prep = io_openat_prep,
.issue = io_openat,
.cleanup = io_open_cleanup,
},
[IORING_OP_CLOSE] = {
.name = "CLOSE",
.prep = io_close_prep,
.issue = io_close,
},
[IORING_OP_FILES_UPDATE] = {
.audit_skip = 1,
.iopoll = 1,
.name = "FILES_UPDATE",
.prep = io_files_update_prep,
.issue = io_files_update,
},
[IORING_OP_STATX] = {
.audit_skip = 1,
.name = "STATX",
.prep = io_statx_prep,
.issue = io_statx,
.cleanup = io_statx_cleanup,
},
[IORING_OP_READ] = {
.needs_file = 1,
@ -282,11 +234,8 @@ const struct io_op_def io_op_defs[] = {
.ioprio = 1,
.iopoll = 1,
.iopoll_queue = 1,
.async_size = sizeof(struct io_async_rw),
.name = "READ",
.prep = io_prep_rw,
.issue = io_read,
.fail = io_rw_fail,
},
[IORING_OP_WRITE] = {
.needs_file = 1,
@ -298,22 +247,17 @@ const struct io_op_def io_op_defs[] = {
.ioprio = 1,
.iopoll = 1,
.iopoll_queue = 1,
.async_size = sizeof(struct io_async_rw),
.name = "WRITE",
.prep = io_prep_rw,
.issue = io_write,
.fail = io_rw_fail,
},
[IORING_OP_FADVISE] = {
.needs_file = 1,
.audit_skip = 1,
.name = "FADVISE",
.prep = io_fadvise_prep,
.issue = io_fadvise,
},
[IORING_OP_MADVISE] = {
.audit_skip = 1,
.name = "MADVISE",
.prep = io_madvise_prep,
.issue = io_madvise,
},
@ -324,13 +268,9 @@ const struct io_op_def io_op_defs[] = {
.audit_skip = 1,
.ioprio = 1,
.manual_alloc = 1,
.name = "SEND",
#if defined(CONFIG_NET)
.async_size = sizeof(struct io_async_msghdr),
.prep = io_sendmsg_prep,
.issue = io_send,
.fail = io_sendrecv_fail,
.prep_async = io_send_prep_async,
#else
.prep = io_eopnotsupp_prep,
#endif
@ -342,25 +282,20 @@ const struct io_op_def io_op_defs[] = {
.buffer_select = 1,
.audit_skip = 1,
.ioprio = 1,
.name = "RECV",
#if defined(CONFIG_NET)
.prep = io_recvmsg_prep,
.issue = io_recv,
.fail = io_sendrecv_fail,
#else
.prep = io_eopnotsupp_prep,
#endif
},
[IORING_OP_OPENAT2] = {
.name = "OPENAT2",
.prep = io_openat2_prep,
.issue = io_openat2,
.cleanup = io_open_cleanup,
},
[IORING_OP_EPOLL_CTL] = {
.unbound_nonreg_file = 1,
.audit_skip = 1,
.name = "EPOLL",
#if defined(CONFIG_EPOLL)
.prep = io_epoll_ctl_prep,
.issue = io_epoll_ctl,
@ -373,21 +308,18 @@ const struct io_op_def io_op_defs[] = {
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
.audit_skip = 1,
.name = "SPLICE",
.prep = io_splice_prep,
.issue = io_splice,
},
[IORING_OP_PROVIDE_BUFFERS] = {
.audit_skip = 1,
.iopoll = 1,
.name = "PROVIDE_BUFFERS",
.prep = io_provide_buffers_prep,
.issue = io_provide_buffers,
},
[IORING_OP_REMOVE_BUFFERS] = {
.audit_skip = 1,
.iopoll = 1,
.name = "REMOVE_BUFFERS",
.prep = io_remove_buffers_prep,
.issue = io_remove_buffers,
},
@ -396,13 +328,11 @@ const struct io_op_def io_op_defs[] = {
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
.audit_skip = 1,
.name = "TEE",
.prep = io_tee_prep,
.issue = io_tee,
},
[IORING_OP_SHUTDOWN] = {
.needs_file = 1,
.name = "SHUTDOWN",
#if defined(CONFIG_NET)
.prep = io_shutdown_prep,
.issue = io_shutdown,
@ -411,72 +341,51 @@ const struct io_op_def io_op_defs[] = {
#endif
},
[IORING_OP_RENAMEAT] = {
.name = "RENAMEAT",
.prep = io_renameat_prep,
.issue = io_renameat,
.cleanup = io_renameat_cleanup,
},
[IORING_OP_UNLINKAT] = {
.name = "UNLINKAT",
.prep = io_unlinkat_prep,
.issue = io_unlinkat,
.cleanup = io_unlinkat_cleanup,
},
[IORING_OP_MKDIRAT] = {
.name = "MKDIRAT",
.prep = io_mkdirat_prep,
.issue = io_mkdirat,
.cleanup = io_mkdirat_cleanup,
},
[IORING_OP_SYMLINKAT] = {
.name = "SYMLINKAT",
.prep = io_symlinkat_prep,
.issue = io_symlinkat,
.cleanup = io_link_cleanup,
},
[IORING_OP_LINKAT] = {
.name = "LINKAT",
.prep = io_linkat_prep,
.issue = io_linkat,
.cleanup = io_link_cleanup,
},
[IORING_OP_MSG_RING] = {
.needs_file = 1,
.iopoll = 1,
.name = "MSG_RING",
.prep = io_msg_ring_prep,
.issue = io_msg_ring,
.cleanup = io_msg_ring_cleanup,
},
[IORING_OP_FSETXATTR] = {
.needs_file = 1,
.name = "FSETXATTR",
.prep = io_fsetxattr_prep,
.issue = io_fsetxattr,
.cleanup = io_xattr_cleanup,
},
[IORING_OP_SETXATTR] = {
.name = "SETXATTR",
.prep = io_setxattr_prep,
.issue = io_setxattr,
.cleanup = io_xattr_cleanup,
},
[IORING_OP_FGETXATTR] = {
.needs_file = 1,
.name = "FGETXATTR",
.prep = io_fgetxattr_prep,
.issue = io_fgetxattr,
.cleanup = io_xattr_cleanup,
},
[IORING_OP_GETXATTR] = {
.name = "GETXATTR",
.prep = io_getxattr_prep,
.issue = io_getxattr,
.cleanup = io_xattr_cleanup,
},
[IORING_OP_SOCKET] = {
.audit_skip = 1,
.name = "SOCKET",
#if defined(CONFIG_NET)
.prep = io_socket_prep,
.issue = io_socket,
@ -487,16 +396,12 @@ const struct io_op_def io_op_defs[] = {
[IORING_OP_URING_CMD] = {
.needs_file = 1,
.plug = 1,
.name = "URING_CMD",
.iopoll = 1,
.iopoll_queue = 1,
.async_size = uring_cmd_pdu_size(1),
.prep = io_uring_cmd_prep,
.issue = io_uring_cmd,
.prep_async = io_uring_cmd_prep_async,
},
[IORING_OP_SEND_ZC] = {
.name = "SEND_ZC",
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
@ -504,32 +409,243 @@ const struct io_op_def io_op_defs[] = {
.ioprio = 1,
.manual_alloc = 1,
#if defined(CONFIG_NET)
.async_size = sizeof(struct io_async_msghdr),
.prep = io_send_zc_prep,
.issue = io_send_zc,
.prep_async = io_send_prep_async,
.cleanup = io_send_zc_cleanup,
.fail = io_sendrecv_fail,
#else
.prep = io_eopnotsupp_prep,
#endif
},
[IORING_OP_SENDMSG_ZC] = {
.name = "SENDMSG_ZC",
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
.ioprio = 1,
.manual_alloc = 1,
#if defined(CONFIG_NET)
.async_size = sizeof(struct io_async_msghdr),
.prep = io_send_zc_prep,
.issue = io_sendmsg_zc,
#else
.prep = io_eopnotsupp_prep,
#endif
},
};
const struct io_cold_def io_cold_defs[] = {
[IORING_OP_NOP] = {
.name = "NOP",
},
[IORING_OP_READV] = {
.async_size = sizeof(struct io_async_rw),
.name = "READV",
.prep_async = io_readv_prep_async,
.cleanup = io_readv_writev_cleanup,
.fail = io_rw_fail,
},
[IORING_OP_WRITEV] = {
.async_size = sizeof(struct io_async_rw),
.name = "WRITEV",
.prep_async = io_writev_prep_async,
.cleanup = io_readv_writev_cleanup,
.fail = io_rw_fail,
},
[IORING_OP_FSYNC] = {
.name = "FSYNC",
},
[IORING_OP_READ_FIXED] = {
.async_size = sizeof(struct io_async_rw),
.name = "READ_FIXED",
.fail = io_rw_fail,
},
[IORING_OP_WRITE_FIXED] = {
.async_size = sizeof(struct io_async_rw),
.name = "WRITE_FIXED",
.fail = io_rw_fail,
},
[IORING_OP_POLL_ADD] = {
.name = "POLL_ADD",
},
[IORING_OP_POLL_REMOVE] = {
.name = "POLL_REMOVE",
},
[IORING_OP_SYNC_FILE_RANGE] = {
.name = "SYNC_FILE_RANGE",
},
[IORING_OP_SENDMSG] = {
.name = "SENDMSG",
#if defined(CONFIG_NET)
.async_size = sizeof(struct io_async_msghdr),
.prep_async = io_sendmsg_prep_async,
.cleanup = io_sendmsg_recvmsg_cleanup,
.fail = io_sendrecv_fail,
#endif
},
[IORING_OP_RECVMSG] = {
.name = "RECVMSG",
#if defined(CONFIG_NET)
.async_size = sizeof(struct io_async_msghdr),
.prep_async = io_recvmsg_prep_async,
.cleanup = io_sendmsg_recvmsg_cleanup,
.fail = io_sendrecv_fail,
#endif
},
[IORING_OP_TIMEOUT] = {
.async_size = sizeof(struct io_timeout_data),
.name = "TIMEOUT",
},
[IORING_OP_TIMEOUT_REMOVE] = {
.name = "TIMEOUT_REMOVE",
},
[IORING_OP_ACCEPT] = {
.name = "ACCEPT",
},
[IORING_OP_ASYNC_CANCEL] = {
.name = "ASYNC_CANCEL",
},
[IORING_OP_LINK_TIMEOUT] = {
.async_size = sizeof(struct io_timeout_data),
.name = "LINK_TIMEOUT",
},
[IORING_OP_CONNECT] = {
.name = "CONNECT",
#if defined(CONFIG_NET)
.async_size = sizeof(struct io_async_connect),
.prep_async = io_connect_prep_async,
#endif
},
[IORING_OP_FALLOCATE] = {
.name = "FALLOCATE",
},
[IORING_OP_OPENAT] = {
.name = "OPENAT",
.cleanup = io_open_cleanup,
},
[IORING_OP_CLOSE] = {
.name = "CLOSE",
},
[IORING_OP_FILES_UPDATE] = {
.name = "FILES_UPDATE",
},
[IORING_OP_STATX] = {
.name = "STATX",
.cleanup = io_statx_cleanup,
},
[IORING_OP_READ] = {
.async_size = sizeof(struct io_async_rw),
.name = "READ",
.fail = io_rw_fail,
},
[IORING_OP_WRITE] = {
.async_size = sizeof(struct io_async_rw),
.name = "WRITE",
.fail = io_rw_fail,
},
[IORING_OP_FADVISE] = {
.name = "FADVISE",
},
[IORING_OP_MADVISE] = {
.name = "MADVISE",
},
[IORING_OP_SEND] = {
.name = "SEND",
#if defined(CONFIG_NET)
.async_size = sizeof(struct io_async_msghdr),
.fail = io_sendrecv_fail,
.prep_async = io_send_prep_async,
#endif
},
[IORING_OP_RECV] = {
.name = "RECV",
#if defined(CONFIG_NET)
.fail = io_sendrecv_fail,
#endif
},
[IORING_OP_OPENAT2] = {
.name = "OPENAT2",
.cleanup = io_open_cleanup,
},
[IORING_OP_EPOLL_CTL] = {
.name = "EPOLL",
},
[IORING_OP_SPLICE] = {
.name = "SPLICE",
},
[IORING_OP_PROVIDE_BUFFERS] = {
.name = "PROVIDE_BUFFERS",
},
[IORING_OP_REMOVE_BUFFERS] = {
.name = "REMOVE_BUFFERS",
},
[IORING_OP_TEE] = {
.name = "TEE",
},
[IORING_OP_SHUTDOWN] = {
.name = "SHUTDOWN",
},
[IORING_OP_RENAMEAT] = {
.name = "RENAMEAT",
.cleanup = io_renameat_cleanup,
},
[IORING_OP_UNLINKAT] = {
.name = "UNLINKAT",
.cleanup = io_unlinkat_cleanup,
},
[IORING_OP_MKDIRAT] = {
.name = "MKDIRAT",
.cleanup = io_mkdirat_cleanup,
},
[IORING_OP_SYMLINKAT] = {
.name = "SYMLINKAT",
.cleanup = io_link_cleanup,
},
[IORING_OP_LINKAT] = {
.name = "LINKAT",
.cleanup = io_link_cleanup,
},
[IORING_OP_MSG_RING] = {
.name = "MSG_RING",
.cleanup = io_msg_ring_cleanup,
},
[IORING_OP_FSETXATTR] = {
.name = "FSETXATTR",
.cleanup = io_xattr_cleanup,
},
[IORING_OP_SETXATTR] = {
.name = "SETXATTR",
.cleanup = io_xattr_cleanup,
},
[IORING_OP_FGETXATTR] = {
.name = "FGETXATTR",
.cleanup = io_xattr_cleanup,
},
[IORING_OP_GETXATTR] = {
.name = "GETXATTR",
.cleanup = io_xattr_cleanup,
},
[IORING_OP_SOCKET] = {
.name = "SOCKET",
},
[IORING_OP_URING_CMD] = {
.name = "URING_CMD",
.async_size = 2 * sizeof(struct io_uring_sqe),
.prep_async = io_uring_cmd_prep_async,
},
[IORING_OP_SEND_ZC] = {
.name = "SEND_ZC",
#if defined(CONFIG_NET)
.async_size = sizeof(struct io_async_msghdr),
.prep_async = io_send_prep_async,
.cleanup = io_send_zc_cleanup,
.fail = io_sendrecv_fail,
#endif
},
[IORING_OP_SENDMSG_ZC] = {
.name = "SENDMSG_ZC",
#if defined(CONFIG_NET)
.async_size = sizeof(struct io_async_msghdr),
.prep_async = io_sendmsg_prep_async,
.cleanup = io_send_zc_cleanup,
.fail = io_sendrecv_fail,
#else
.prep = io_eopnotsupp_prep,
#endif
},
};
@ -537,7 +653,7 @@ const struct io_op_def io_op_defs[] = {
const char *io_uring_get_opcode(u8 opcode)
{
if (opcode < IORING_OP_LAST)
return io_op_defs[opcode].name;
return io_cold_defs[opcode].name;
return "INVALID";
}
@ -545,12 +661,13 @@ void __init io_uring_optable_init(void)
{
int i;
BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
BUILD_BUG_ON(ARRAY_SIZE(io_cold_defs) != IORING_OP_LAST);
BUILD_BUG_ON(ARRAY_SIZE(io_issue_defs) != IORING_OP_LAST);
for (i = 0; i < ARRAY_SIZE(io_op_defs); i++) {
BUG_ON(!io_op_defs[i].prep);
if (io_op_defs[i].prep != io_eopnotsupp_prep)
BUG_ON(!io_op_defs[i].issue);
WARN_ON_ONCE(!io_op_defs[i].name);
for (i = 0; i < ARRAY_SIZE(io_issue_defs); i++) {
BUG_ON(!io_issue_defs[i].prep);
if (io_issue_defs[i].prep != io_eopnotsupp_prep)
BUG_ON(!io_issue_defs[i].issue);
WARN_ON_ONCE(!io_cold_defs[i].name);
}
}

View File

@ -2,7 +2,7 @@
#ifndef IOU_OP_DEF_H
#define IOU_OP_DEF_H
struct io_op_def {
struct io_issue_def {
/* needs req->file assigned */
unsigned needs_file : 1;
/* should block plug */
@ -29,19 +29,24 @@ struct io_op_def {
unsigned iopoll_queue : 1;
/* opcode specific path will handle ->async_data allocation if needed */
unsigned manual_alloc : 1;
int (*issue)(struct io_kiocb *, unsigned int);
int (*prep)(struct io_kiocb *, const struct io_uring_sqe *);
};
struct io_cold_def {
/* size of async data needed, if any */
unsigned short async_size;
const char *name;
int (*prep)(struct io_kiocb *, const struct io_uring_sqe *);
int (*issue)(struct io_kiocb *, unsigned int);
int (*prep_async)(struct io_kiocb *);
void (*cleanup)(struct io_kiocb *);
void (*fail)(struct io_kiocb *);
};
extern const struct io_op_def io_op_defs[];
extern const struct io_issue_def io_issue_defs[];
extern const struct io_cold_def io_cold_defs[];
void io_uring_optable_init(void);
#endif

View File

@ -31,6 +31,17 @@ struct io_close {
u32 file_slot;
};
static bool io_openat_force_async(struct io_open *open)
{
/*
* Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open,
* it'll always -EAGAIN. Note that we test for __O_TMPFILE because
* O_TMPFILE includes O_DIRECTORY, which isn't a flag we need to force
* async for.
*/
return open->how.flags & (O_TRUNC | O_CREAT | __O_TMPFILE);
}
static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_open *open = io_kiocb_to_cmd(req, struct io_open);
@ -61,6 +72,8 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
open->nofile = rlimit(RLIMIT_NOFILE);
req->flags |= REQ_F_NEED_CLEANUP;
if (io_openat_force_async(open))
req->flags |= REQ_F_FORCE_ASYNC;
return 0;
}
@ -108,12 +121,7 @@ int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
nonblock_set = op.open_flag & O_NONBLOCK;
resolve_nonblock = open->how.resolve & RESOLVE_CACHED;
if (issue_flags & IO_URING_F_NONBLOCK) {
/*
* Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open,
* it'll always -EAGAIN
*/
if (open->how.flags & (O_TRUNC | O_CREAT | O_TMPFILE))
return -EAGAIN;
WARN_ON_ONCE(io_openat_force_async(open));
op.lookup_flags |= LOOKUP_CACHED;
op.open_flag |= O_NONBLOCK;
}
@ -144,7 +152,6 @@ int io_openat2(struct io_kiocb *req, unsigned int issue_flags)
if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set)
file->f_flags &= ~O_NONBLOCK;
fsnotify_open(file);
if (!fixed)
fd_install(ret, file);

View File

@ -51,6 +51,9 @@ struct io_poll_table {
#define IO_WQE_F_DOUBLE 1
static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
void *key);
static inline struct io_kiocb *wqe_to_req(struct wait_queue_entry *wqe)
{
unsigned long priv = (unsigned long)wqe->private;
@ -145,7 +148,7 @@ static void io_poll_req_insert_locked(struct io_kiocb *req)
hlist_add_head(&req->hash_node, &table->hbs[index].list);
}
static void io_poll_tw_hash_eject(struct io_kiocb *req, bool *locked)
static void io_poll_tw_hash_eject(struct io_kiocb *req, struct io_tw_state *ts)
{
struct io_ring_ctx *ctx = req->ctx;
@ -156,7 +159,7 @@ static void io_poll_tw_hash_eject(struct io_kiocb *req, bool *locked)
* already grabbed the mutex for us, but there is a chance it
* failed.
*/
io_tw_lock(ctx, locked);
io_tw_lock(ctx, ts);
hash_del(&req->hash_node);
req->flags &= ~REQ_F_HASH_LOCKED;
} else {
@ -164,15 +167,14 @@ static void io_poll_tw_hash_eject(struct io_kiocb *req, bool *locked)
}
}
static void io_init_poll_iocb(struct io_poll *poll, __poll_t events,
wait_queue_func_t wake_func)
static void io_init_poll_iocb(struct io_poll *poll, __poll_t events)
{
poll->head = NULL;
#define IO_POLL_UNMASK (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP)
/* mask in events that we always want/need */
poll->events = events | IO_POLL_UNMASK;
INIT_LIST_HEAD(&poll->wait.entry);
init_waitqueue_func_entry(&poll->wait, wake_func);
init_waitqueue_func_entry(&poll->wait, io_poll_wake);
}
static inline void io_poll_remove_entry(struct io_poll *poll)
@ -236,7 +238,7 @@ enum {
* req->cqe.res. IOU_POLL_REMOVE_POLL_USE_RES indicates to remove multishot
* poll and that the result is stored in req->cqe.
*/
static int io_poll_check_events(struct io_kiocb *req, bool *locked)
static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts)
{
int v;
@ -298,13 +300,13 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked)
__poll_t mask = mangle_poll(req->cqe.res &
req->apoll_events);
if (!io_aux_cqe(req->ctx, *locked, req->cqe.user_data,
mask, IORING_CQE_F_MORE, false)) {
if (!io_fill_cqe_req_aux(req, ts->locked, mask,
IORING_CQE_F_MORE)) {
io_req_set_res(req, mask, 0);
return IOU_POLL_REMOVE_POLL_USE_RES;
}
} else {
int ret = io_poll_issue(req, locked);
int ret = io_poll_issue(req, ts);
if (ret == IOU_STOP_MULTISHOT)
return IOU_POLL_REMOVE_POLL_USE_RES;
if (ret < 0)
@ -324,15 +326,15 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked)
return IOU_POLL_NO_ACTION;
}
static void io_poll_task_func(struct io_kiocb *req, bool *locked)
void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts)
{
int ret;
ret = io_poll_check_events(req, locked);
ret = io_poll_check_events(req, ts);
if (ret == IOU_POLL_NO_ACTION)
return;
io_poll_remove_entries(req);
io_poll_tw_hash_eject(req, locked);
io_poll_tw_hash_eject(req, ts);
if (req->opcode == IORING_OP_POLL_ADD) {
if (ret == IOU_POLL_DONE) {
@ -341,7 +343,7 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked)
poll = io_kiocb_to_cmd(req, struct io_poll);
req->cqe.res = mangle_poll(req->cqe.res & poll->events);
} else if (ret == IOU_POLL_REISSUE) {
io_req_task_submit(req, locked);
io_req_task_submit(req, ts);
return;
} else if (ret != IOU_POLL_REMOVE_POLL_USE_RES) {
req->cqe.res = ret;
@ -349,14 +351,14 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked)
}
io_req_set_res(req, req->cqe.res, 0);
io_req_task_complete(req, locked);
io_req_task_complete(req, ts);
} else {
io_tw_lock(req->ctx, locked);
io_tw_lock(req->ctx, ts);
if (ret == IOU_POLL_REMOVE_POLL_USE_RES)
io_req_task_complete(req, locked);
io_req_task_complete(req, ts);
else if (ret == IOU_POLL_DONE || ret == IOU_POLL_REISSUE)
io_req_task_submit(req, locked);
io_req_task_submit(req, ts);
else
io_req_defer_failed(req, ret);
}
@ -508,7 +510,7 @@ static void __io_queue_proc(struct io_poll *poll, struct io_poll_table *pt,
/* mark as double wq entry */
wqe_private |= IO_WQE_F_DOUBLE;
io_init_poll_iocb(poll, first->events, first->wait.func);
io_init_poll_iocb(poll, first->events);
if (!io_poll_double_prepare(req)) {
/* the request is completing, just back off */
kfree(poll);
@ -569,7 +571,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
INIT_HLIST_NODE(&req->hash_node);
req->work.cancel_seq = atomic_read(&ctx->cancel_seq);
io_init_poll_iocb(poll, mask, io_poll_wake);
io_init_poll_iocb(poll, mask);
poll->file = req->file;
req->apoll_events = poll->events;
@ -690,7 +692,7 @@ alloc_apoll:
int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
{
const struct io_op_def *def = &io_op_defs[req->opcode];
const struct io_issue_def *def = &io_issue_defs[req->opcode];
struct async_poll *apoll;
struct io_poll_table ipt;
__poll_t mask = POLLPRI | POLLERR | EPOLLET;
@ -822,14 +824,10 @@ static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx,
spin_lock(&hb->lock);
hlist_for_each_entry(req, &hb->list, hash_node) {
if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) &&
req->file != cd->file)
continue;
if (cd->seq == req->work.cancel_seq)
continue;
req->work.cancel_seq = cd->seq;
*out_bucket = hb;
return req;
if (io_cancel_req_match(req, cd)) {
*out_bucket = hb;
return req;
}
}
spin_unlock(&hb->lock);
}
@ -853,7 +851,8 @@ static int __io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
struct io_hash_bucket *bucket;
struct io_kiocb *req;
if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_ANY))
if (cd->flags & (IORING_ASYNC_CANCEL_FD | IORING_ASYNC_CANCEL_OP |
IORING_ASYNC_CANCEL_ANY))
req = io_poll_file_find(ctx, cd, table, &bucket);
else
req = io_poll_find(ctx, false, cd, table, &bucket);
@ -970,12 +969,12 @@ int io_poll_add(struct io_kiocb *req, unsigned int issue_flags)
int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_poll_update *poll_update = io_kiocb_to_cmd(req, struct io_poll_update);
struct io_cancel_data cd = { .data = poll_update->old_user_data, };
struct io_ring_ctx *ctx = req->ctx;
struct io_cancel_data cd = { .ctx = ctx, .data = poll_update->old_user_data, };
struct io_hash_bucket *bucket;
struct io_kiocb *preq;
int ret2, ret = 0;
bool locked = true;
struct io_tw_state ts = { .locked = true };
io_ring_submit_lock(ctx, issue_flags);
preq = io_poll_find(ctx, true, &cd, &ctx->cancel_table, &bucket);
@ -1024,7 +1023,7 @@ found:
req_set_fail(preq);
io_req_set_res(preq, -ECANCELED, 0);
io_req_task_complete(preq, &locked);
io_req_task_complete(preq, &ts);
out:
io_ring_submit_unlock(ctx, issue_flags);
if (ret < 0) {

View File

@ -38,3 +38,5 @@ bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
bool cancel_all);
void io_apoll_cache_free(struct io_cache_entry *entry);
void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts);

View File

@ -23,24 +23,21 @@ struct io_rsrc_update {
u32 offset;
};
static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
struct io_mapped_ubuf **pimu,
struct page **last_hpage);
#define IO_RSRC_REF_BATCH 100
/* only define max */
#define IORING_MAX_FIXED_FILES (1U << 20)
#define IORING_MAX_REG_BUFFERS (1U << 14)
void io_rsrc_refs_drop(struct io_ring_ctx *ctx)
__must_hold(&ctx->uring_lock)
{
if (ctx->rsrc_cached_refs) {
io_rsrc_put_node(ctx->rsrc_node, ctx->rsrc_cached_refs);
ctx->rsrc_cached_refs = 0;
}
}
/*
 * Placeholder mapped buffer. Its address range is deliberately invalid
 * (start at the top of the address space, end at zero) so that
 * io_import_fixed() fails when it meets it.
 */
static const struct io_mapped_ubuf dummy_ubuf = {
	.ubuf_end = 0,
	.ubuf = ~0UL,
};
int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
{
@ -141,7 +138,7 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo
struct io_mapped_ubuf *imu = *slot;
unsigned int i;
if (imu != ctx->dummy_ubuf) {
if (imu != &dummy_ubuf) {
for (i = 0; i < imu->nr_bvecs; i++)
unpin_user_page(imu->bvec[i].bv_page);
if (imu->acct_pages)
@ -151,216 +148,129 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo
*slot = NULL;
}
void io_rsrc_refs_refill(struct io_ring_ctx *ctx)
__must_hold(&ctx->uring_lock)
static void io_rsrc_put_work(struct io_rsrc_node *node)
{
ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH;
percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH);
}
struct io_rsrc_put *prsrc = &node->item;
static void __io_rsrc_put_work(struct io_rsrc_node *ref_node)
{
struct io_rsrc_data *rsrc_data = ref_node->rsrc_data;
struct io_ring_ctx *ctx = rsrc_data->ctx;
struct io_rsrc_put *prsrc, *tmp;
if (prsrc->tag)
io_post_aux_cqe(node->ctx, prsrc->tag, 0, 0);
list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) {
list_del(&prsrc->list);
if (prsrc->tag) {
if (ctx->flags & IORING_SETUP_IOPOLL) {
mutex_lock(&ctx->uring_lock);
io_post_aux_cqe(ctx, prsrc->tag, 0, 0);
mutex_unlock(&ctx->uring_lock);
} else {
io_post_aux_cqe(ctx, prsrc->tag, 0, 0);
}
}
rsrc_data->do_put(ctx, prsrc);
kfree(prsrc);
}
io_rsrc_node_destroy(ref_node);
if (atomic_dec_and_test(&rsrc_data->refs))
complete(&rsrc_data->done);
}
void io_rsrc_put_work(struct work_struct *work)
{
struct io_ring_ctx *ctx;
struct llist_node *node;
ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work);
node = llist_del_all(&ctx->rsrc_put_llist);
while (node) {
struct io_rsrc_node *ref_node;
struct llist_node *next = node->next;
ref_node = llist_entry(node, struct io_rsrc_node, llist);
__io_rsrc_put_work(ref_node);
node = next;
switch (node->type) {
case IORING_RSRC_FILE:
io_rsrc_file_put(node->ctx, prsrc);
break;
case IORING_RSRC_BUFFER:
io_rsrc_buf_put(node->ctx, prsrc);
break;
default:
WARN_ON_ONCE(1);
break;
}
}
void io_rsrc_put_tw(struct callback_head *cb)
void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
{
struct io_ring_ctx *ctx = container_of(cb, struct io_ring_ctx,
rsrc_put_tw);
io_rsrc_put_work(&ctx->rsrc_put_work.work);
if (!io_alloc_cache_put(&ctx->rsrc_node_cache, &node->cache))
kfree(node);
}
void io_wait_rsrc_data(struct io_rsrc_data *data)
void io_rsrc_node_ref_zero(struct io_rsrc_node *node)
__must_hold(&node->ctx->uring_lock)
{
if (data && !atomic_dec_and_test(&data->refs))
wait_for_completion(&data->done);
}
void io_rsrc_node_destroy(struct io_rsrc_node *ref_node)
{
percpu_ref_exit(&ref_node->refs);
kfree(ref_node);
}
static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref)
{
struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs);
struct io_ring_ctx *ctx = node->rsrc_data->ctx;
unsigned long flags;
bool first_add = false;
unsigned long delay = HZ;
spin_lock_irqsave(&ctx->rsrc_ref_lock, flags);
node->done = true;
/* if we are mid-quiesce then do not delay */
if (node->rsrc_data->quiesce)
delay = 0;
struct io_ring_ctx *ctx = node->ctx;
while (!list_empty(&ctx->rsrc_ref_list)) {
node = list_first_entry(&ctx->rsrc_ref_list,
struct io_rsrc_node, node);
/* recycle ref nodes in order */
if (!node->done)
if (node->refs)
break;
list_del(&node->node);
first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist);
}
spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags);
if (!first_add)
return;
if (ctx->submitter_task) {
if (!task_work_add(ctx->submitter_task, &ctx->rsrc_put_tw,
ctx->notify_method))
return;
if (likely(!node->empty))
io_rsrc_put_work(node);
io_rsrc_node_destroy(ctx, node);
}
mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay);
if (list_empty(&ctx->rsrc_ref_list) && unlikely(ctx->rsrc_quiesce))
wake_up_all(&ctx->rsrc_quiesce_wq);
}
static struct io_rsrc_node *io_rsrc_node_alloc(void)
struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
{
struct io_rsrc_node *ref_node;
struct io_cache_entry *entry;
ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
if (!ref_node)
return NULL;
if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero,
0, GFP_KERNEL)) {
kfree(ref_node);
return NULL;
entry = io_alloc_cache_get(&ctx->rsrc_node_cache);
if (entry) {
ref_node = container_of(entry, struct io_rsrc_node, cache);
} else {
ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
if (!ref_node)
return NULL;
}
INIT_LIST_HEAD(&ref_node->node);
INIT_LIST_HEAD(&ref_node->rsrc_list);
ref_node->done = false;
ref_node->ctx = ctx;
ref_node->empty = 0;
ref_node->refs = 1;
return ref_node;
}
void io_rsrc_node_switch(struct io_ring_ctx *ctx,
struct io_rsrc_data *data_to_kill)
__must_hold(&ctx->uring_lock)
{
WARN_ON_ONCE(!ctx->rsrc_backup_node);
WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node);
io_rsrc_refs_drop(ctx);
if (data_to_kill) {
struct io_rsrc_node *rsrc_node = ctx->rsrc_node;
rsrc_node->rsrc_data = data_to_kill;
spin_lock_irq(&ctx->rsrc_ref_lock);
list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list);
spin_unlock_irq(&ctx->rsrc_ref_lock);
atomic_inc(&data_to_kill->refs);
percpu_ref_kill(&rsrc_node->refs);
ctx->rsrc_node = NULL;
}
if (!ctx->rsrc_node) {
ctx->rsrc_node = ctx->rsrc_backup_node;
ctx->rsrc_backup_node = NULL;
}
}
int io_rsrc_node_switch_start(struct io_ring_ctx *ctx)
{
if (ctx->rsrc_backup_node)
return 0;
ctx->rsrc_backup_node = io_rsrc_node_alloc();
return ctx->rsrc_backup_node ? 0 : -ENOMEM;
}
__cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
struct io_ring_ctx *ctx)
{
struct io_rsrc_node *backup;
DEFINE_WAIT(we);
int ret;
/* As we may drop ->uring_lock, other task may have started quiesce */
/* As We may drop ->uring_lock, other task may have started quiesce */
if (data->quiesce)
return -ENXIO;
ret = io_rsrc_node_switch_start(ctx);
if (ret)
return ret;
io_rsrc_node_switch(ctx, data);
/* kill initial ref, already quiesced if zero */
if (atomic_dec_and_test(&data->refs))
backup = io_rsrc_node_alloc(ctx);
if (!backup)
return -ENOMEM;
ctx->rsrc_node->empty = true;
ctx->rsrc_node->type = -1;
list_add_tail(&ctx->rsrc_node->node, &ctx->rsrc_ref_list);
io_put_rsrc_node(ctx, ctx->rsrc_node);
ctx->rsrc_node = backup;
if (list_empty(&ctx->rsrc_ref_list))
return 0;
if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
atomic_set(&ctx->cq_wait_nr, 1);
smp_mb();
}
ctx->rsrc_quiesce++;
data->quiesce = true;
mutex_unlock(&ctx->uring_lock);
do {
prepare_to_wait(&ctx->rsrc_quiesce_wq, &we, TASK_INTERRUPTIBLE);
mutex_unlock(&ctx->uring_lock);
ret = io_run_task_work_sig(ctx);
if (ret < 0) {
atomic_inc(&data->refs);
/* wait for all works potentially completing data->done */
flush_delayed_work(&ctx->rsrc_put_work);
reinit_completion(&data->done);
mutex_lock(&ctx->uring_lock);
if (list_empty(&ctx->rsrc_ref_list))
ret = 0;
break;
}
flush_delayed_work(&ctx->rsrc_put_work);
ret = wait_for_completion_interruptible(&data->done);
if (!ret) {
mutex_lock(&ctx->uring_lock);
if (atomic_read(&data->refs) <= 0)
break;
/*
* it has been revived by another thread while
* we were unlocked
*/
mutex_unlock(&ctx->uring_lock);
}
} while (1);
data->quiesce = false;
schedule();
__set_current_state(TASK_RUNNING);
mutex_lock(&ctx->uring_lock);
ret = 0;
} while (!list_empty(&ctx->rsrc_ref_list));
finish_wait(&ctx->rsrc_quiesce_wq, &we);
data->quiesce = false;
ctx->rsrc_quiesce--;
if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
atomic_set(&ctx->cq_wait_nr, 0);
smp_mb();
}
return ret;
}
@ -405,12 +315,12 @@ static __cold void **io_alloc_page_table(size_t size)
return table;
}
__cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx,
rsrc_put_fn *do_put, u64 __user *utags,
__cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, int type,
u64 __user *utags,
unsigned nr, struct io_rsrc_data **pdata)
{
struct io_rsrc_data *data;
int ret = -ENOMEM;
int ret = 0;
unsigned i;
data = kzalloc(sizeof(*data), GFP_KERNEL);
@ -424,7 +334,7 @@ __cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx,
data->nr = nr;
data->ctx = ctx;
data->do_put = do_put;
data->rsrc_type = type;
if (utags) {
ret = -EFAULT;
for (i = 0; i < nr; i++) {
@ -435,9 +345,6 @@ __cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx,
goto fail;
}
}
atomic_set(&data->refs, 1);
init_completion(&data->done);
*pdata = data;
return 0;
fail:
@ -453,10 +360,8 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
__s32 __user *fds = u64_to_user_ptr(up->data);
struct io_rsrc_data *data = ctx->file_data;
struct io_fixed_file *file_slot;
struct file *file;
int fd, i, err = 0;
unsigned int done;
bool needs_switch = false;
if (!ctx->file_data)
return -ENXIO;
@ -482,16 +387,16 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
file_slot = io_fixed_file_slot(&ctx->file_table, i);
if (file_slot->file_ptr) {
file = (struct file *)(file_slot->file_ptr & FFS_MASK);
err = io_queue_rsrc_removal(data, i, ctx->rsrc_node, file);
err = io_queue_rsrc_removal(data, i,
io_slot_file(file_slot));
if (err)
break;
file_slot->file_ptr = 0;
io_file_bitmap_clear(&ctx->file_table, i);
needs_switch = true;
}
if (fd != -1) {
file = fget(fd);
struct file *file = fget(fd);
if (!file) {
err = -EBADF;
break;
@ -519,9 +424,6 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
io_file_bitmap_set(&ctx->file_table, i);
}
}
if (needs_switch)
io_rsrc_node_switch(ctx, data);
return done ? done : err;
}
@ -532,7 +434,6 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
u64 __user *tags = u64_to_user_ptr(up->tags);
struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
struct page *last_hpage = NULL;
bool needs_switch = false;
__u32 done;
int i, err;
@ -543,7 +444,6 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
for (done = 0; done < nr_args; done++) {
struct io_mapped_ubuf *imu;
int offset = up->offset + done;
u64 tag = 0;
err = io_copy_iov(ctx, &iov, iovs, done);
@ -564,24 +464,20 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
if (err)
break;
i = array_index_nospec(offset, ctx->nr_user_bufs);
if (ctx->user_bufs[i] != ctx->dummy_ubuf) {
i = array_index_nospec(up->offset + done, ctx->nr_user_bufs);
if (ctx->user_bufs[i] != &dummy_ubuf) {
err = io_queue_rsrc_removal(ctx->buf_data, i,
ctx->rsrc_node, ctx->user_bufs[i]);
ctx->user_bufs[i]);
if (unlikely(err)) {
io_buffer_unmap(ctx, &imu);
break;
}
ctx->user_bufs[i] = ctx->dummy_ubuf;
needs_switch = true;
ctx->user_bufs[i] = (struct io_mapped_ubuf *)&dummy_ubuf;
}
ctx->user_bufs[i] = imu;
*io_get_tag_slot(ctx->buf_data, i) = tag;
}
if (needs_switch)
io_rsrc_node_switch(ctx, ctx->buf_data);
return done ? done : err;
}
@ -590,13 +486,11 @@ static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
unsigned nr_args)
{
__u32 tmp;
int err;
lockdep_assert_held(&ctx->uring_lock);
if (check_add_overflow(up->offset, nr_args, &tmp))
return -EOVERFLOW;
err = io_rsrc_node_switch_start(ctx);
if (err)
return err;
switch (type) {
case IORING_RSRC_FILE:
@ -753,20 +647,24 @@ int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
return IOU_OK;
}
int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
struct io_rsrc_node *node, void *rsrc)
int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void *rsrc)
{
struct io_ring_ctx *ctx = data->ctx;
struct io_rsrc_node *node = ctx->rsrc_node;
u64 *tag_slot = io_get_tag_slot(data, idx);
struct io_rsrc_put *prsrc;
prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL);
if (!prsrc)
ctx->rsrc_node = io_rsrc_node_alloc(ctx);
if (unlikely(!ctx->rsrc_node)) {
ctx->rsrc_node = node;
return -ENOMEM;
}
prsrc->tag = *tag_slot;
node->item.rsrc = rsrc;
node->type = data->rsrc_type;
node->item.tag = *tag_slot;
*tag_slot = 0;
prsrc->rsrc = rsrc;
list_add(&prsrc->list, &node->rsrc_list);
list_add_tail(&node->node, &ctx->rsrc_ref_list);
io_put_rsrc_node(ctx, node);
return 0;
}
@ -881,20 +779,14 @@ int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file)
return 0;
}
static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
static __cold void io_rsrc_file_scm_put(struct io_ring_ctx *ctx, struct file *file)
{
struct file *file = prsrc->file;
#if defined(CONFIG_UNIX)
struct sock *sock = ctx->ring_sock->sk;
struct sk_buff_head list, *head = &sock->sk_receive_queue;
struct sk_buff *skb;
int i;
if (!io_file_need_scm(file)) {
fput(file);
return;
}
__skb_queue_head_init(&list);
/*
@ -944,11 +836,19 @@ static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
__skb_queue_tail(head, skb);
spin_unlock_irq(&head->lock);
}
#else
fput(file);
#endif
}
/*
 * Release the file carried by @prsrc.
 *
 * The file is dropped with a plain fput() unless io_file_need_scm() reports
 * that it needs SCM handling, in which case the release is delegated to
 * io_rsrc_file_scm_put().
 */
static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
{
struct file *file = prsrc->file;
/* the non-SCM case is annotated as the likely one */
if (likely(!io_file_need_scm(file)))
fput(file);
else
io_rsrc_file_scm_put(ctx, file);
}
int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
unsigned nr_args, u64 __user *tags)
{
@ -965,10 +865,7 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
return -EMFILE;
if (nr_args > rlimit(RLIMIT_NOFILE))
return -EMFILE;
ret = io_rsrc_node_switch_start(ctx);
if (ret)
return ret;
ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args,
ret = io_rsrc_data_alloc(ctx, IORING_RSRC_FILE, tags, nr_args,
&ctx->file_data);
if (ret)
return ret;
@ -1022,7 +919,6 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
/* default it to the whole table */
io_file_table_set_alloc_range(ctx, 0, ctx->nr_user_files);
io_rsrc_node_switch(ctx, NULL);
return 0;
fail:
__io_sqe_files_unregister(ctx);
@ -1207,8 +1103,9 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
unsigned long off;
size_t size;
int ret, nr_pages, i;
struct folio *folio = NULL;
*pimu = ctx->dummy_ubuf;
*pimu = (struct io_mapped_ubuf *)&dummy_ubuf;
if (!iov->iov_base)
return 0;
@ -1221,6 +1118,32 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
goto done;
}
/* If it's a huge page, try to coalesce them into a single bvec entry */
if (nr_pages > 1) {
folio = page_folio(pages[0]);
for (i = 1; i < nr_pages; i++) {
/*
* Pages must be consecutive and on the same folio for
* this to work
*/
if (page_folio(pages[i]) != folio ||
pages[i] != pages[i - 1] + 1) {
folio = NULL;
break;
}
}
if (folio) {
/*
* The pages are bound to the folio, it doesn't
* actually unpin them but drops all but one reference,
* which is usually put down by io_buffer_unmap().
* Note, needs a better helper.
*/
unpin_user_pages(&pages[1], nr_pages - 1);
nr_pages = 1;
}
}
imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
if (!imu)
goto done;
@ -1233,22 +1156,25 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
off = (unsigned long) iov->iov_base & ~PAGE_MASK;
size = iov->iov_len;
for (i = 0; i < nr_pages; i++) {
size_t vec_len;
vec_len = min_t(size_t, size, PAGE_SIZE - off);
imu->bvec[i].bv_page = pages[i];
imu->bvec[i].bv_len = vec_len;
imu->bvec[i].bv_offset = off;
off = 0;
size -= vec_len;
}
/* store original address for later verification */
imu->ubuf = (unsigned long) iov->iov_base;
imu->ubuf_end = imu->ubuf + iov->iov_len;
imu->nr_bvecs = nr_pages;
*pimu = imu;
ret = 0;
if (folio) {
bvec_set_page(&imu->bvec[0], pages[0], size, off);
goto done;
}
for (i = 0; i < nr_pages; i++) {
size_t vec_len;
vec_len = min_t(size_t, size, PAGE_SIZE - off);
bvec_set_page(&imu->bvec[i], pages[i], vec_len, off);
off = 0;
size -= vec_len;
}
done:
if (ret)
kvfree(imu);
@ -1276,10 +1202,7 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
return -EBUSY;
if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
return -EINVAL;
ret = io_rsrc_node_switch_start(ctx);
if (ret)
return ret;
ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data);
ret = io_rsrc_data_alloc(ctx, IORING_RSRC_BUFFER, tags, nr_args, &data);
if (ret)
return ret;
ret = io_buffers_map_alloc(ctx, nr_args);
@ -1316,8 +1239,6 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
ctx->buf_data = data;
if (ret)
__io_sqe_buffers_unregister(ctx);
else
io_rsrc_node_switch(ctx, NULL);
return ret;
}
@ -1337,7 +1258,7 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
return -EFAULT;
/*
* May not be a start of buffer, set size appropriately
* Might not be a start of buffer, set size appropriately
* and advance us to the beginning.
*/
offset = buf_addr - imu->ubuf;
@ -1363,7 +1284,15 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
const struct bio_vec *bvec = imu->bvec;
if (offset <= bvec->bv_len) {
iov_iter_advance(iter, offset);
/*
* Note, huge pages buffers consists of one large
* bvec entry and should always go this way. The other
* branch doesn't expect non PAGE_SIZE'd chunks.
*/
iter->bvec = bvec;
iter->nr_segs = bvec->bv_len;
iter->count -= offset;
iter->iov_offset = offset;
} else {
unsigned long seg_skip;

View File

@ -4,6 +4,10 @@
#include <net/af_unix.h>
#include "alloc_cache.h"
#define IO_NODE_ALLOC_CACHE_MAX 32
#define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3)
#define IO_RSRC_TAG_TABLE_MAX (1U << IO_RSRC_TAG_TABLE_SHIFT)
#define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1)
@ -14,7 +18,6 @@ enum {
};
struct io_rsrc_put {
struct list_head list;
u64 tag;
union {
void *rsrc;
@ -30,19 +33,20 @@ struct io_rsrc_data {
u64 **tags;
unsigned int nr;
rsrc_put_fn *do_put;
atomic_t refs;
struct completion done;
u16 rsrc_type;
bool quiesce;
};
struct io_rsrc_node {
struct percpu_ref refs;
union {
struct io_cache_entry cache;
struct io_ring_ctx *ctx;
};
int refs;
bool empty;
u16 type;
struct list_head node;
struct list_head rsrc_list;
struct io_rsrc_data *rsrc_data;
struct llist_node llist;
bool done;
struct io_rsrc_put item;
};
struct io_mapped_ubuf {
@ -53,17 +57,10 @@ struct io_mapped_ubuf {
struct bio_vec bvec[];
};
void io_rsrc_put_tw(struct callback_head *cb);
void io_rsrc_put_work(struct work_struct *work);
void io_rsrc_refs_refill(struct io_ring_ctx *ctx);
void io_wait_rsrc_data(struct io_rsrc_data *data);
void io_rsrc_node_destroy(struct io_rsrc_node *ref_node);
void io_rsrc_refs_drop(struct io_ring_ctx *ctx);
int io_rsrc_node_switch_start(struct io_ring_ctx *ctx);
int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
struct io_rsrc_node *node, void *rsrc);
void io_rsrc_node_switch(struct io_ring_ctx *ctx,
struct io_rsrc_data *data_to_kill);
void io_rsrc_node_ref_zero(struct io_rsrc_node *node);
void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node);
struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx);
int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void *rsrc);
int io_import_fixed(int ddir, struct iov_iter *iter,
struct io_mapped_ubuf *imu,
@ -107,36 +104,24 @@ int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
unsigned int size, unsigned int type);
static inline void io_rsrc_put_node(struct io_rsrc_node *node, int nr)
static inline void io_put_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
{
percpu_ref_put_many(&node->refs, nr);
}
lockdep_assert_held(&ctx->uring_lock);
static inline void io_req_put_rsrc(struct io_kiocb *req)
{
if (req->rsrc_node)
io_rsrc_put_node(req->rsrc_node, 1);
if (node && !--node->refs)
io_rsrc_node_ref_zero(node);
}
static inline void io_req_put_rsrc_locked(struct io_kiocb *req,
struct io_ring_ctx *ctx)
__must_hold(&ctx->uring_lock)
{
struct io_rsrc_node *node = req->rsrc_node;
if (node) {
if (node == ctx->rsrc_node)
ctx->rsrc_cached_refs++;
else
io_rsrc_put_node(node, 1);
}
io_put_rsrc_node(ctx, req->rsrc_node);
}
static inline void io_charge_rsrc_node(struct io_ring_ctx *ctx)
static inline void io_charge_rsrc_node(struct io_ring_ctx *ctx,
struct io_rsrc_node *node)
{
ctx->rsrc_cached_refs--;
if (unlikely(ctx->rsrc_cached_refs < 0))
io_rsrc_refs_refill(ctx);
node->refs++;
}
static inline void io_req_set_rsrc_node(struct io_kiocb *req,
@ -149,7 +134,7 @@ static inline void io_req_set_rsrc_node(struct io_kiocb *req,
lockdep_assert_held(&ctx->uring_lock);
req->rsrc_node = ctx->rsrc_node;
io_charge_rsrc_node(ctx);
io_charge_rsrc_node(ctx, ctx->rsrc_node);
io_ring_submit_unlock(ctx, issue_flags);
}
}
@ -162,6 +147,12 @@ static inline u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx)
return &data->tags[table_idx][off];
}
/*
 * Allocate a resource node with io_rsrc_node_alloc() and store it in
 * ctx->rsrc_node.
 *
 * Returns 0 on success, or -ENOMEM if the allocation returned NULL.
 */
static inline int io_rsrc_init(struct io_ring_ctx *ctx)
{
ctx->rsrc_node = io_rsrc_node_alloc(ctx);
return ctx->rsrc_node ? 0 : -ENOMEM;
}
int io_files_update(struct io_kiocb *req, unsigned int issue_flags);
int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);

View File

@ -105,6 +105,7 @@ int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe)
} else {
rw->kiocb.ki_ioprio = get_current_ioprio();
}
rw->kiocb.dio_complete = NULL;
rw->addr = READ_ONCE(sqe->addr);
rw->len = READ_ONCE(sqe->len);
@ -283,16 +284,25 @@ static inline int io_fixup_rw_res(struct io_kiocb *req, long res)
return res;
}
static void io_req_rw_complete(struct io_kiocb *req, bool *locked)
void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts)
{
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
struct kiocb *kiocb = &rw->kiocb;
if ((kiocb->ki_flags & IOCB_DIO_CALLER_COMP) && kiocb->dio_complete) {
long res = kiocb->dio_complete(rw->kiocb.private);
io_req_set_res(req, io_fixup_rw_res(req, res), 0);
}
io_req_io_end(req);
if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) {
unsigned issue_flags = *locked ? 0 : IO_URING_F_UNLOCKED;
unsigned issue_flags = ts->locked ? 0 : IO_URING_F_UNLOCKED;
req->cqe.flags |= io_put_kbuf(req, issue_flags);
}
io_req_task_complete(req, locked);
io_req_task_complete(req, ts);
}
static void io_complete_rw(struct kiocb *kiocb, long res)
@ -300,11 +310,13 @@ static void io_complete_rw(struct kiocb *kiocb, long res)
struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb);
struct io_kiocb *req = cmd_to_io_kiocb(rw);
if (__io_complete_rw_common(req, res))
return;
io_req_set_res(req, io_fixup_rw_res(req, res), 0);
if (!kiocb->dio_complete || !(kiocb->ki_flags & IOCB_DIO_CALLER_COMP)) {
if (__io_complete_rw_common(req, res))
return;
io_req_set_res(req, io_fixup_rw_res(req, res), 0);
}
req->io_task_work.func = io_req_rw_complete;
io_req_task_work_add(req);
__io_req_task_work_add(req, IOU_F_TWQ_LAZY_WAKE);
}
static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
@ -332,7 +344,7 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret,
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
unsigned final_ret = io_fixup_rw_res(req, ret);
if (req->flags & REQ_F_CUR_POS)
if (ret >= 0 && req->flags & REQ_F_CUR_POS)
req->file->f_pos = rw->kiocb.ki_pos;
if (ret >= 0 && (rw->kiocb.ki_complete == io_complete_rw)) {
if (!__io_complete_rw_common(req, ret)) {
@ -391,7 +403,7 @@ static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req,
rw->len = sqe_len;
}
ret = import_single_range(ddir, buf, sqe_len, s->fast_iov, iter);
ret = import_ubuf(ddir, buf, sqe_len, iter);
if (ret)
return ERR_PTR(ret);
return NULL;
@ -410,7 +422,7 @@ static inline int io_import_iovec(int rw, struct io_kiocb *req,
unsigned int issue_flags)
{
*iovec = __io_import_iovec(rw, req, s, issue_flags);
if (unlikely(IS_ERR(*iovec)))
if (IS_ERR(*iovec))
return PTR_ERR(*iovec);
iov_iter_save_state(&s->iter, &s->iter_state);
@ -447,23 +459,25 @@ static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter)
ppos = io_kiocb_ppos(kiocb);
while (iov_iter_count(iter)) {
struct iovec iovec;
void __user *addr;
size_t len;
ssize_t nr;
if (!iov_iter_is_bvec(iter)) {
iovec = iov_iter_iovec(iter);
if (iter_is_ubuf(iter)) {
addr = iter->ubuf + iter->iov_offset;
len = iov_iter_count(iter);
} else if (!iov_iter_is_bvec(iter)) {
addr = iter_iov_addr(iter);
len = iter_iov_len(iter);
} else {
iovec.iov_base = u64_to_user_ptr(rw->addr);
iovec.iov_len = rw->len;
addr = u64_to_user_ptr(rw->addr);
len = rw->len;
}
if (ddir == READ) {
nr = file->f_op->read(file, iovec.iov_base,
iovec.iov_len, ppos);
} else {
nr = file->f_op->write(file, iovec.iov_base,
iovec.iov_len, ppos);
}
if (ddir == READ)
nr = file->f_op->read(file, addr, len, ppos);
else
nr = file->f_op->write(file, addr, len, ppos);
if (nr < 0) {
if (!ret)
@ -479,7 +493,7 @@ static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter)
if (!rw->len)
break;
}
if (nr != iovec.iov_len)
if (nr != len)
break;
}
@ -495,15 +509,15 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
io->free_iovec = iovec;
io->bytes_done = 0;
/* can only be fixed buffers, no need to do anything */
if (iov_iter_is_bvec(iter))
if (iov_iter_is_bvec(iter) || iter_is_ubuf(iter))
return;
if (!iovec) {
unsigned iov_off = 0;
io->s.iter.iov = io->s.fast_iov;
if (iter->iov != fast_iov) {
iov_off = iter->iov - fast_iov;
io->s.iter.iov += iov_off;
io->s.iter.__iov = io->s.fast_iov;
if (iter->__iov != fast_iov) {
iov_off = iter_iov(iter) - fast_iov;
io->s.iter.__iov += iov_off;
}
if (io->s.fast_iov != fast_iov)
memcpy(io->s.fast_iov + iov_off, fast_iov + iov_off,
@ -516,7 +530,7 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec,
static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
struct io_rw_state *s, bool force)
{
if (!force && !io_op_defs[req->opcode].prep_async)
if (!force && !io_cold_defs[req->opcode].prep_async)
return 0;
if (!req_has_async_data(req)) {
struct io_async_rw *iorw;
@ -664,8 +678,8 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
if (unlikely(!file || !(file->f_mode & mode)))
return -EBADF;
if (!io_req_ffs_set(req))
req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT;
if (!(req->flags & REQ_F_FIXED_FILE))
req->flags |= io_file_get_flags(file);
kiocb->ki_flags = iocb_flags(file);
ret = kiocb_set_rw_flags(kiocb, rw->flags);
@ -981,13 +995,6 @@ copy_iov:
return ret;
}
static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx)
{
io_commit_cqring_flush(ctx);
if (ctx->flags & IORING_SETUP_SQPOLL)
io_cqring_wake(ctx);
}
void io_rw_fail(struct io_kiocb *req)
{
int res;
@ -1058,24 +1065,17 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
if (!smp_load_acquire(&req->iopoll_completed))
break;
nr_events++;
if (unlikely(req->flags & REQ_F_CQE_SKIP))
continue;
req->cqe.flags = io_put_kbuf(req, 0);
if (unlikely(!__io_fill_cqe_req(ctx, req))) {
spin_lock(&ctx->completion_lock);
io_req_cqe_overflow(req);
spin_unlock(&ctx->completion_lock);
}
}
if (unlikely(!nr_events))
return 0;
io_commit_cqring(ctx);
io_cqring_ev_posted_iopoll(ctx);
pos = start ? start->next : ctx->iopoll_list.first;
wq_list_cut(&ctx->iopoll_list, prev, start);
io_free_batch_list(ctx, pos);
if (WARN_ON_ONCE(!wq_list_empty(&ctx->submit_state.compl_reqs)))
return 0;
ctx->submit_state.compl_reqs.first = pos;
__io_submit_flush_completions(ctx);
return nr_events;
}

View File

@ -22,3 +22,4 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags);
int io_writev_prep_async(struct io_kiocb *req);
void io_readv_writev_cleanup(struct io_kiocb *req);
void io_rw_fail(struct io_kiocb *req);
void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts);

View File

@ -3,6 +3,9 @@
#include <linux/io_uring_types.h>
/*
 * Iterate @pos over every node of @head, starting at (head)->first and
 * following ->next until it is NULL. Unlike wq_list_for_each(), no
 * previous-node cursor is maintained.
 */
#define __wq_list_for_each(pos, head) \
for (pos = (head)->first; pos; pos = (pos)->next)
/*
 * Iterate @pos over every node of @head, starting at (head)->first and
 * following ->next until it is NULL. @prv trails one step behind: it is NULL
 * for the first node and otherwise holds the node visited just before @pos.
 */
#define wq_list_for_each(pos, prv, head) \
for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next)
@ -27,28 +30,6 @@ static inline void wq_list_add_after(struct io_wq_work_node *node,
list->last = node;
}
/**
* wq_list_merge - merge the second list to the first one.
* @list0: the first list
* @list1: the second list
* Return the first node after mergence.
*/
static inline struct io_wq_work_node *wq_list_merge(struct io_wq_work_list *list0,
struct io_wq_work_list *list1)
{
struct io_wq_work_node *ret;
if (!list0->first) {
ret = list1->first;
} else {
ret = list0->first;
list0->last->next = list1->first;
}
INIT_WQ_LIST(list0);
INIT_WQ_LIST(list1);
return ret;
}
static inline void wq_list_add_tail(struct io_wq_work_node *node,
struct io_wq_work_list *list)
{
@ -135,4 +116,4 @@ static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
return container_of(work->list.next, struct io_wq_work, list);
}
#endif // INTERNAL_IO_SLIST_H
#endif // INTERNAL_IO_SLIST_H

View File

@ -34,6 +34,7 @@ static int __io_splice_prep(struct io_kiocb *req,
if (unlikely(sp->flags & ~valid_flags))
return -EINVAL;
sp->splice_fd_in = READ_ONCE(sqe->splice_fd_in);
req->flags |= REQ_F_FORCE_ASYNC;
return 0;
}
@ -52,8 +53,7 @@ int io_tee(struct io_kiocb *req, unsigned int issue_flags)
struct file *in;
long ret = 0;
if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
if (sp->flags & SPLICE_F_FD_IN_FIXED)
in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags);
@ -68,7 +68,7 @@ int io_tee(struct io_kiocb *req, unsigned int issue_flags)
ret = do_tee(in, out, sp->len, flags);
if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
io_put_file(in);
fput(in);
done:
if (ret != sp->len)
req_set_fail(req);
@ -94,8 +94,7 @@ int io_splice(struct io_kiocb *req, unsigned int issue_flags)
struct file *in;
long ret = 0;
if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
if (sp->flags & SPLICE_F_FD_IN_FIXED)
in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags);
@ -113,7 +112,7 @@ int io_splice(struct io_kiocb *req, unsigned int issue_flags)
ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
io_put_file(in);
fput(in);
done:
if (ret != sp->len)
req_set_fail(req);

View File

@ -255,9 +255,13 @@ static int io_sq_thread(void *data)
sqt_spin = true;
if (sqt_spin || !time_after(jiffies, timeout)) {
cond_resched();
if (sqt_spin)
timeout = jiffies + sqd->sq_thread_idle;
if (unlikely(need_resched())) {
mutex_unlock(&sqd->lock);
cond_resched();
mutex_lock(&sqd->lock);
}
continue;
}
@ -311,7 +315,7 @@ static int io_sq_thread(void *data)
do_exit(0);
}
int io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
void io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
{
DEFINE_WAIT(wait);
@ -326,7 +330,6 @@ int io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
} while (!signal_pending(current));
finish_wait(&ctx->sqo_sq_wait, &wait);
return 0;
}
__cold int io_sq_offload_create(struct io_ring_ctx *ctx,
@ -418,3 +421,20 @@ err:
io_sq_thread_finish(ctx);
return ret;
}
/*
 * Apply @mask, via io_wq_cpu_affinity(), to the io_uring task context of the
 * SQ thread attached to @ctx.
 *
 * Returns -EINVAL when @ctx has no sq_data or the sq_data has no thread;
 * otherwise returns whatever io_wq_cpu_affinity() returns.
 */
__cold int io_sqpoll_wq_cpu_affinity(struct io_ring_ctx *ctx,
cpumask_var_t mask)
{
struct io_sq_data *sqd = ctx->sq_data;
int ret = -EINVAL;
if (sqd) {
/* park/unpark bracket both the sqd->thread check and the affinity call */
io_sq_thread_park(sqd);
/* Don't set affinity for a dying thread */
if (sqd->thread)
ret = io_wq_cpu_affinity(sqd->thread->io_uring, mask);
io_sq_thread_unpark(sqd);
}
return ret;
}

View File

@ -26,4 +26,5 @@ void io_sq_thread_stop(struct io_sq_data *sqd);
void io_sq_thread_park(struct io_sq_data *sqd);
void io_sq_thread_unpark(struct io_sq_data *sqd);
void io_put_sq_data(struct io_sq_data *sqd);
int io_sqpoll_wait_sq(struct io_ring_ctx *ctx);
void io_sqpoll_wait_sq(struct io_ring_ctx *ctx);
int io_sqpoll_wq_cpu_affinity(struct io_ring_ctx *ctx, cpumask_var_t mask);

View File

@ -48,6 +48,7 @@ int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
}
req->flags |= REQ_F_NEED_CLEANUP;
req->flags |= REQ_F_FORCE_ASYNC;
return 0;
}
@ -56,8 +57,7 @@ int io_statx(struct io_kiocb *req, unsigned int issue_flags)
struct io_statx *sx = io_kiocb_to_cmd(req, struct io_statx);
int ret;
if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
ret = do_statx(sx->dfd, sx->filename, sx->flags, sx->mask, sx->buffer);
io_req_set_res(req, ret, 0);

View File

@ -32,6 +32,8 @@ int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
sync->off = READ_ONCE(sqe->off);
sync->len = READ_ONCE(sqe->len);
sync->flags = READ_ONCE(sqe->sync_range_flags);
req->flags |= REQ_F_FORCE_ASYNC;
return 0;
}
@ -41,8 +43,7 @@ int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
int ret;
/* sync_file_range always requires a blocking context */
if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
ret = sync_file_range(req->file, sync->off, sync->len, sync->flags);
io_req_set_res(req, ret, 0);
@ -62,6 +63,7 @@ int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
sync->off = READ_ONCE(sqe->off);
sync->len = READ_ONCE(sqe->len);
req->flags |= REQ_F_FORCE_ASYNC;
return 0;
}
@ -72,8 +74,7 @@ int io_fsync(struct io_kiocb *req, unsigned int issue_flags)
int ret;
/* fsync always requires a blocking context */
if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
ret = vfs_fsync_range(req->file, sync->off, end > 0 ? end : LLONG_MAX,
sync->flags & IORING_FSYNC_DATASYNC);
@ -91,6 +92,7 @@ int io_fallocate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
sync->off = READ_ONCE(sqe->off);
sync->len = READ_ONCE(sqe->addr);
sync->mode = READ_ONCE(sqe->len);
req->flags |= REQ_F_FORCE_ASYNC;
return 0;
}
@ -100,8 +102,8 @@ int io_fallocate(struct io_kiocb *req, unsigned int issue_flags)
int ret;
/* fallocate always requiring blocking context */
if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
ret = vfs_fallocate(req->file, sync->mode, sync->off, sync->len);
if (ret >= 0)
fsnotify_modify(req->file);

View File

@ -83,7 +83,7 @@ __cold int io_uring_alloc_task_context(struct task_struct *task,
xa_init(&tctx->xa);
init_waitqueue_head(&tctx->wait);
atomic_set(&tctx->in_idle, 0);
atomic_set(&tctx->in_cancel, 0);
atomic_set(&tctx->inflight_tracked, 0);
task->io_uring = tctx;
init_llist_head(&tctx->task_list);
@ -208,29 +208,38 @@ void io_uring_unreg_ringfd(void)
}
}
/*
 * Store @file in the first unused slot of tctx->registered_rings within the
 * index range [@start, @end).
 *
 * Returns the index of the slot that was filled, or -EBUSY when no slot in
 * the range is free (including when the range is empty). This function
 * neither takes nor drops a reference on @file.
 */
int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file,
int start, int end)
{
int offset;
for (offset = start; offset < end; offset++) {
/* sanitize the index against IO_RINGFD_REG_MAX before it indexes the array */
offset = array_index_nospec(offset, IO_RINGFD_REG_MAX);
if (tctx->registered_rings[offset])
continue;
tctx->registered_rings[offset] = file;
return offset;
}
return -EBUSY;
}
static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd,
int start, int end)
{
struct file *file;
int offset;
for (offset = start; offset < end; offset++) {
offset = array_index_nospec(offset, IO_RINGFD_REG_MAX);
if (tctx->registered_rings[offset])
continue;
file = fget(fd);
if (!file) {
return -EBADF;
} else if (!io_is_uring_fops(file)) {
fput(file);
return -EOPNOTSUPP;
}
tctx->registered_rings[offset] = file;
return offset;
file = fget(fd);
if (!file) {
return -EBADF;
} else if (!io_is_uring_fops(file)) {
fput(file);
return -EOPNOTSUPP;
}
return -EBUSY;
offset = io_ring_add_registered_file(tctx, file, start, end);
if (offset < 0)
fput(file);
return offset;
}
/*

View File

@ -17,6 +17,7 @@ struct io_timeout {
struct file *file;
u32 off;
u32 target_seq;
u32 repeats;
struct list_head list;
/* head of the link, used by linked timeouts only */
struct io_kiocb *head;
@ -37,8 +38,9 @@ struct io_timeout_rem {
static inline bool io_is_timeout_noseq(struct io_kiocb *req)
{
struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout);
struct io_timeout_data *data = req->async_data;
return !timeout->off;
return !timeout->off || data->flags & IORING_TIMEOUT_MULTISHOT;
}
static inline void io_put_req(struct io_kiocb *req)
@ -49,6 +51,44 @@ static inline void io_put_req(struct io_kiocb *req)
}
}
/*
 * Decide whether a timeout that just fired is done.
 *
 * Returns true when the request should be completed, false when it should
 * be kept alive for another expiry:
 *  - without IORING_TIMEOUT_MULTISHOT the timeout is always done;
 *  - a multishot timeout with ->off == 0 is never done here;
 *  - otherwise ->repeats is decremented (only if it is non-zero) and the
 *    timeout is done once it reaches zero.
 */
static inline bool io_timeout_finish(struct io_timeout *timeout,
				     struct io_timeout_data *data)
{
	if (!(data->flags & IORING_TIMEOUT_MULTISHOT))
		return true;

	/* multishot with no offset: keep going */
	if (!timeout->off)
		return false;

	/* nothing left to count down; must not decrement past zero */
	if (!timeout->repeats)
		return true;

	/* consume one repetition; finished when none remain */
	return --timeout->repeats == 0;
}
static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer);
/*
 * Task-work handler installed by io_timeout_fn() for an expired timeout.
 *
 * If io_timeout_finish() reports that the timeout is not done, a CQE with
 * result -ETIME and IORING_CQE_F_MORE is requested via io_fill_cqe_req_aux().
 * When that call returns true, the timeout is put back at the tail of
 * ctx->timeout_list and its hrtimer is started again, all under
 * ctx->timeout_lock. In every other case the request is completed through
 * io_req_task_complete().
 */
static void io_timeout_complete(struct io_kiocb *req, struct io_tw_state *ts)
{
	struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout);
	struct io_timeout_data *data = req->async_data;
	struct io_ring_ctx *ctx = req->ctx;

	/*
	 * Short-circuit order matters: the CQE is only attempted when the
	 * timeout is not finished.
	 */
	if (io_timeout_finish(timeout, data) ||
	    !io_fill_cqe_req_aux(req, ts->locked, -ETIME, IORING_CQE_F_MORE)) {
		io_req_task_complete(req, ts);
		return;
	}

	/* re-arm timer */
	spin_lock_irq(&ctx->timeout_lock);
	list_add(&timeout->list, ctx->timeout_list.prev);
	data->timer.function = io_timeout_fn;
	hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
	spin_unlock_irq(&ctx->timeout_lock);
}
static bool io_kill_timeout(struct io_kiocb *req, int status)
__must_hold(&req->ctx->timeout_lock)
{
@ -101,9 +141,9 @@ __cold void io_flush_timeouts(struct io_ring_ctx *ctx)
spin_unlock_irq(&ctx->timeout_lock);
}
static void io_req_tw_fail_links(struct io_kiocb *link, bool *locked)
static void io_req_tw_fail_links(struct io_kiocb *link, struct io_tw_state *ts)
{
io_tw_lock(link->ctx, locked);
io_tw_lock(link->ctx, ts);
while (link) {
struct io_kiocb *nxt = link->link;
long res = -ECANCELED;
@ -112,7 +152,7 @@ static void io_req_tw_fail_links(struct io_kiocb *link, bool *locked)
res = link->cqe.res;
link->link = NULL;
io_req_set_res(link, res, 0);
io_req_task_complete(link, locked);
io_req_task_complete(link, ts);
link = nxt;
}
}
@ -212,7 +252,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
req_set_fail(req);
io_req_set_res(req, -ETIME, 0);
req->io_task_work.func = io_req_task_complete;
req->io_task_work.func = io_timeout_complete;
io_req_task_work_add(req);
return HRTIMER_NORESTART;
}
@ -228,16 +268,10 @@ static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx,
list_for_each_entry(timeout, &ctx->timeout_list, list) {
struct io_kiocb *tmp = cmd_to_io_kiocb(timeout);
if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) &&
cd->data != tmp->cqe.user_data)
continue;
if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) {
if (cd->seq == tmp->work.cancel_seq)
continue;
tmp->work.cancel_seq = cd->seq;
if (io_cancel_req_match(tmp, cd)) {
req = tmp;
break;
}
req = tmp;
break;
}
if (!req)
return ERR_PTR(-ENOENT);
@ -265,9 +299,9 @@ int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd)
return 0;
}
static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked)
static void io_req_task_link_timeout(struct io_kiocb *req, struct io_tw_state *ts)
{
unsigned issue_flags = *locked ? 0 : IO_URING_F_UNLOCKED;
unsigned issue_flags = ts->locked ? 0 : IO_URING_F_UNLOCKED;
struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout);
struct io_kiocb *prev = timeout->prev;
int ret = -ENOENT;
@ -282,11 +316,11 @@ static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked)
ret = io_try_cancel(req->task->io_uring, &cd, issue_flags);
}
io_req_set_res(req, ret ?: -ETIME, 0);
io_req_task_complete(req, locked);
io_req_task_complete(req, ts);
io_put_req(prev);
} else {
io_req_set_res(req, -ETIME, 0);
io_req_task_complete(req, locked);
io_req_task_complete(req, ts);
}
}
@ -369,7 +403,7 @@ static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data,
struct timespec64 *ts, enum hrtimer_mode mode)
__must_hold(&ctx->timeout_lock)
{
struct io_cancel_data cd = { .data = user_data, };
struct io_cancel_data cd = { .ctx = ctx, .data = user_data, };
struct io_kiocb *req = io_timeout_extract(ctx, &cd);
struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout);
struct io_timeout_data *data;
@ -433,7 +467,7 @@ int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags)
int ret;
if (!(tr->flags & IORING_TIMEOUT_UPDATE)) {
struct io_cancel_data cd = { .data = tr->addr, };
struct io_cancel_data cd = { .ctx = ctx, .data = tr->addr, };
spin_lock(&ctx->completion_lock);
ret = io_timeout_cancel(ctx, &cd);
@ -470,16 +504,27 @@ static int __io_timeout_prep(struct io_kiocb *req,
return -EINVAL;
flags = READ_ONCE(sqe->timeout_flags);
if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK |
IORING_TIMEOUT_ETIME_SUCCESS))
IORING_TIMEOUT_ETIME_SUCCESS |
IORING_TIMEOUT_MULTISHOT))
return -EINVAL;
/* more than one clock specified is invalid, obviously */
if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1)
return -EINVAL;
/* multishot requests only make sense with rel values */
if (!(~flags & (IORING_TIMEOUT_MULTISHOT | IORING_TIMEOUT_ABS)))
return -EINVAL;
INIT_LIST_HEAD(&timeout->list);
timeout->off = off;
if (unlikely(off && !req->ctx->off_timeout_used))
req->ctx->off_timeout_used = true;
/*
* for multishot reqs w/ fixed nr of repeats, repeats tracks the
* remaining nr
*/
timeout->repeats = 0;
if ((flags & IORING_TIMEOUT_MULTISHOT) && off > 0)
timeout->repeats = off;
if (WARN_ON_ONCE(req_has_async_data(req)))
return -EFAULT;
@ -543,7 +588,7 @@ int io_timeout(struct io_kiocb *req, unsigned int issue_flags)
goto add;
}
tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
tail = data_race(ctx->cached_cq_tail) - atomic_read(&ctx->cq_timeouts);
timeout->target_seq = tail + off;
/* Update the last seq here in case io_flush_timeouts() hasn't.

View File

@ -7,36 +7,44 @@
#include <linux/nospec.h>
#include <uapi/linux/io_uring.h>
#include <uapi/asm-generic/ioctls.h>
#include "io_uring.h"
#include "rsrc.h"
#include "uring_cmd.h"
static void io_uring_cmd_work(struct io_kiocb *req, bool *locked)
static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts)
{
struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
unsigned issue_flags = *locked ? 0 : IO_URING_F_UNLOCKED;
unsigned issue_flags = ts->locked ? 0 : IO_URING_F_UNLOCKED;
ioucmd->task_work_cb(ioucmd, issue_flags);
}
void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
void (*task_work_cb)(struct io_uring_cmd *, unsigned))
void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
void (*task_work_cb)(struct io_uring_cmd *, unsigned),
unsigned flags)
{
struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
ioucmd->task_work_cb = task_work_cb;
req->io_task_work.func = io_uring_cmd_work;
io_req_task_work_add(req);
__io_req_task_work_add(req, flags);
}
EXPORT_SYMBOL_GPL(io_uring_cmd_complete_in_task);
EXPORT_SYMBOL_GPL(__io_uring_cmd_do_in_task);
/*
 * Schedule @task_work_cb for @ioucmd through __io_uring_cmd_do_in_task(),
 * always passing IOU_F_TWQ_LAZY_WAKE as the task-work flags.
 *
 * NOTE(review): the exact wake-up semantics of IOU_F_TWQ_LAZY_WAKE are
 * defined by __io_req_task_work_add(), which is not visible here.
 */
void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd,
void (*task_work_cb)(struct io_uring_cmd *, unsigned))
{
__io_uring_cmd_do_in_task(ioucmd, task_work_cb, IOU_F_TWQ_LAZY_WAKE);
}
EXPORT_SYMBOL_GPL(io_uring_cmd_do_in_task_lazy);
static inline void io_req_set_cqe32_extra(struct io_kiocb *req,
u64 extra1, u64 extra2)
{
req->extra1 = extra1;
req->extra2 = extra2;
req->flags |= REQ_F_CQE32_INIT;
req->big_cqe.extra1 = extra1;
req->big_cqe.extra2 = extra2;
}
/*
@ -54,25 +62,24 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2,
io_req_set_res(req, ret, 0);
if (req->ctx->flags & IORING_SETUP_CQE32)
io_req_set_cqe32_extra(req, res2, 0);
if (req->ctx->flags & IORING_SETUP_IOPOLL)
if (req->ctx->flags & IORING_SETUP_IOPOLL) {
/* order with io_iopoll_req_issued() checking ->iopoll_complete */
smp_store_release(&req->iopoll_completed, 1);
else
io_req_complete_post(req, issue_flags);
} else {
struct io_tw_state ts = {
.locked = !(issue_flags & IO_URING_F_UNLOCKED),
};
io_req_task_complete(req, &ts);
}
}
EXPORT_SYMBOL_GPL(io_uring_cmd_done);
int io_uring_cmd_prep_async(struct io_kiocb *req)
{
struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
size_t cmd_size;
BUILD_BUG_ON(uring_cmd_pdu_size(0) != 16);
BUILD_BUG_ON(uring_cmd_pdu_size(1) != 80);
cmd_size = uring_cmd_pdu_size(req->ctx->flags & IORING_SETUP_SQE128);
memcpy(req->async_data, ioucmd->cmd, cmd_size);
memcpy(req->async_data, ioucmd->sqe, uring_sqe_size(req->ctx));
ioucmd->sqe = req->async_data;
return 0;
}
@ -98,7 +105,7 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
req->imu = ctx->user_bufs[index];
io_req_set_rsrc_node(req, ctx, 0);
}
ioucmd->cmd = sqe->cmd;
ioucmd->sqe = sqe;
ioucmd->cmd_op = READ_ONCE(sqe->cmd_op);
return 0;
}
@ -129,9 +136,6 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
WRITE_ONCE(ioucmd->cookie, NULL);
}
if (req_has_async_data(req))
ioucmd->cmd = req->async_data;
ret = file->f_op->uring_cmd(ioucmd, issue_flags);
if (ret == -EAGAIN) {
if (!req_has_async_data(req)) {
@ -160,3 +164,30 @@ int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
return io_import_fixed(rw, iter, req->imu, ubuf, len);
}
EXPORT_SYMBOL_GPL(io_uring_cmd_import_fixed);
/*
 * uring_cmd handler for sockets.
 *
 * Supports SOCKET_URING_OP_SIOCINQ and SOCKET_URING_OP_SIOCOUTQ by calling
 * the protocol's ->ioctl() with SIOCINQ / SIOCOUTQ and a kernel int as the
 * output argument.
 *
 * @cmd:         the uring command; cmd->file must be a socket file
 * @issue_flags: io_uring issue flags (unused here)
 *
 * Returns the value the protocol stored in the output int on success, the
 * protocol's error code if ->ioctl() failed, or -EOPNOTSUPP if the protocol
 * has no ->ioctl() or the opcode is not recognised.
 */
int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
	struct socket *sock = cmd->file->private_data;
	struct sock *sk = sock->sk;
	struct proto *prot = READ_ONCE(sk->sk_prot);
	int ret, arg = 0;

	if (!prot || !prot->ioctl)
		return -EOPNOTSUPP;

	/*
	 * Dispatch on the opcode that io_uring_cmd_prep() already fetched
	 * once with READ_ONCE() and stored in ->cmd_op, rather than doing a
	 * second, unannotated read through cmd->sqe.
	 */
	switch (cmd->cmd_op) {
	case SOCKET_URING_OP_SIOCINQ:
		ret = prot->ioctl(sk, SIOCINQ, &arg);
		break;
	case SOCKET_URING_OP_SIOCOUTQ:
		ret = prot->ioctl(sk, SIOCOUTQ, &arg);
		break;
	default:
		return -EOPNOTSUPP;
	}

	/* a non-zero ->ioctl() result is an error; otherwise report the count */
	if (ret)
		return ret;
	return arg;
}
EXPORT_SYMBOL_GPL(io_uring_cmd_sock);

View File

@ -3,11 +3,3 @@
int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags);
int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_uring_cmd_prep_async(struct io_kiocb *req);
/*
* The URING_CMD payload starts at 'cmd' in the first sqe, and continues into
* the following sqe if SQE128 is used.
*/
#define uring_cmd_pdu_size(is_sqe128) \
((1 + !!(is_sqe128)) * sizeof(struct io_uring_sqe) - \
offsetof(struct io_uring_sqe, cmd))

View File

@ -75,6 +75,7 @@ static int __io_getxattr_prep(struct io_kiocb *req,
}
req->flags |= REQ_F_NEED_CLEANUP;
req->flags |= REQ_F_FORCE_ASYNC;
return 0;
}
@ -109,8 +110,7 @@ int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags)
struct io_xattr *ix = io_kiocb_to_cmd(req, struct io_xattr);
int ret;
if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
ret = do_getxattr(mnt_user_ns(req->file->f_path.mnt),
req->file->f_path.dentry,
@ -127,8 +127,7 @@ int io_getxattr(struct io_kiocb *req, unsigned int issue_flags)
struct path path;
int ret;
if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
retry:
ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL);
@ -176,6 +175,7 @@ static int __io_setxattr_prep(struct io_kiocb *req,
}
req->flags |= REQ_F_NEED_CLEANUP;
req->flags |= REQ_F_FORCE_ASYNC;
return 0;
}
@ -224,8 +224,7 @@ int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags)
{
int ret;
if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
ret = __io_setxattr(req, issue_flags, &req->file->f_path);
io_xattr_finish(req, ret);
@ -239,8 +238,7 @@ int io_setxattr(struct io_kiocb *req, unsigned int issue_flags)
struct path path;
int ret;
if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
retry:
ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL);

View File

@ -126,13 +126,13 @@ __out: \
iterate_buf(i, n, base, len, off, \
i->ubuf, (I)) \
} else if (likely(iter_is_iovec(i))) { \
const struct iovec *iov = i->iov; \
const struct iovec *iov = iter_iov(i); \
void __user *base; \
size_t len; \
iterate_iovec(i, n, base, len, off, \
iov, (I)) \
i->nr_segs -= iov - i->iov; \
i->iov = iov; \
i->nr_segs -= iov - iter_iov(i); \
i->__iov = iov; \
} else if (iov_iter_is_bvec(i)) { \
const struct bio_vec *bvec = i->bvec; \
void *base; \
@ -361,7 +361,7 @@ size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t size)
size_t skip;
size -= count;
for (p = i->iov, skip = i->iov_offset; count; p++, skip = 0) {
for (p = iter_iov(i), skip = i->iov_offset; count; p++, skip = 0) {
size_t len = min(count, p->iov_len - skip);
size_t ret;
@ -404,7 +404,7 @@ size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t size)
size_t skip;
size -= count;
for (p = i->iov, skip = i->iov_offset; count; p++, skip = 0) {
for (p = iter_iov(i), skip = i->iov_offset; count; p++, skip = 0) {
size_t len = min(count, p->iov_len - skip);
size_t ret;
@ -431,7 +431,7 @@ void iov_iter_init(struct iov_iter *i, unsigned int direction,
.nofault = false,
.user_backed = true,
.data_source = direction,
.iov = iov,
.__iov = iov,
.nr_segs = nr_segs,
.iov_offset = 0,
.count = count
@ -881,14 +881,14 @@ static void iov_iter_iovec_advance(struct iov_iter *i, size_t size)
i->count -= size;
size += i->iov_offset; // from beginning of current segment
for (iov = i->iov, end = iov + i->nr_segs; iov < end; iov++) {
for (iov = iter_iov(i), end = iov + i->nr_segs; iov < end; iov++) {
if (likely(size < iov->iov_len))
break;
size -= iov->iov_len;
}
i->iov_offset = size;
i->nr_segs -= iov - i->iov;
i->iov = iov;
i->nr_segs -= iov - iter_iov(i);
i->__iov = iov;
}
void iov_iter_advance(struct iov_iter *i, size_t size)
@ -963,12 +963,12 @@ void iov_iter_revert(struct iov_iter *i, size_t unroll)
unroll -= n;
}
} else { /* same logics for iovec and kvec */
const struct iovec *iov = i->iov;
const struct iovec *iov = iter_iov(i);
while (1) {
size_t n = (--iov)->iov_len;
i->nr_segs++;
if (unroll <= n) {
i->iov = iov;
i->__iov = iov;
i->iov_offset = n - unroll;
return;
}
@ -985,7 +985,7 @@ size_t iov_iter_single_seg_count(const struct iov_iter *i)
{
if (i->nr_segs > 1) {
if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
return min(i->count, i->iov->iov_len - i->iov_offset);
return min(i->count, iter_iov(i)->iov_len - i->iov_offset);
if (iov_iter_is_bvec(i))
return min(i->count, i->bvec->bv_len - i->iov_offset);
}
@ -1100,13 +1100,14 @@ static bool iov_iter_aligned_iovec(const struct iov_iter *i, unsigned addr_mask,
unsigned k;
for (k = 0; k < i->nr_segs; k++, skip = 0) {
size_t len = i->iov[k].iov_len - skip;
const struct iovec *iov = iter_iov(i) + k;
size_t len = iov->iov_len - skip;
if (len > size)
len = size;
if (len & len_mask)
return false;
if ((unsigned long)(i->iov[k].iov_base + skip) & addr_mask)
if ((unsigned long)(iov->iov_base + skip) & addr_mask)
return false;
size -= len;
@ -1199,9 +1200,10 @@ static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i)
unsigned k;
for (k = 0; k < i->nr_segs; k++, skip = 0) {
size_t len = i->iov[k].iov_len - skip;
const struct iovec *iov = iter_iov(i) + k;
size_t len = iov->iov_len - skip;
if (len) {
res |= (unsigned long)i->iov[k].iov_base + skip;
res |= (unsigned long)iov->iov_base + skip;
if (len > size)
len = size;
res |= len;
@ -1278,14 +1280,15 @@ unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
return ~0U;
for (k = 0; k < i->nr_segs; k++) {
if (i->iov[k].iov_len) {
unsigned long base = (unsigned long)i->iov[k].iov_base;
const struct iovec *iov = iter_iov(i) + k;
if (iov->iov_len) {
unsigned long base = (unsigned long)iov->iov_base;
if (v) // if not the first one
res |= base | v; // this start | previous end
v = base + i->iov[k].iov_len;
if (size <= i->iov[k].iov_len)
v = base + iov->iov_len;
if (size <= iov->iov_len)
break;
size -= i->iov[k].iov_len;
size -= iov->iov_len;
}
}
return res;
@ -1401,13 +1404,14 @@ static unsigned long first_iovec_segment(const struct iov_iter *i, size_t *size)
return (unsigned long)i->ubuf + i->iov_offset;
for (k = 0, skip = i->iov_offset; k < i->nr_segs; k++, skip = 0) {
size_t len = i->iov[k].iov_len - skip;
const struct iovec *iov = iter_iov(i) + k;
size_t len = iov->iov_len - skip;
if (unlikely(!len))
continue;
if (*size > len)
*size = len;
return (unsigned long)i->iov[k].iov_base + skip;
return (unsigned long)iov->iov_base + skip;
}
BUG(); // if it had been empty, we wouldn't get called
}
@ -1596,7 +1600,7 @@ static int iov_npages(const struct iov_iter *i, int maxpages)
const struct iovec *p;
int npages = 0;
for (p = i->iov; size; skip = 0, p++) {
for (p = iter_iov(i); size; skip = 0, p++) {
unsigned offs = offset_in_page(p->iov_base + skip);
size_t len = min(p->iov_len - skip, size);
@ -1673,7 +1677,7 @@ const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags)
flags);
else if (iov_iter_is_kvec(new) || iter_is_iovec(new))
/* iovec and kvec have identical layout */
return new->iov = kmemdup(new->iov,
return new->__iov = kmemdup(new->__iov,
new->nr_segs * sizeof(struct iovec),
flags);
return NULL;
@ -1855,6 +1859,17 @@ int import_single_range(int rw, void __user *buf, size_t len,
}
EXPORT_SYMBOL(import_single_range);
/*
 * Initialise @i as a single-buffer user iterator over @buf.
 *
 * @rw:  data direction, passed through to iov_iter_ubuf()
 * @buf: user-space buffer
 * @len: requested length; silently capped at MAX_RW_COUNT
 * @i:   iterator to initialise
 *
 * Returns 0 on success, or -EFAULT if access_ok() rejects the (capped)
 * range, in which case @i is left untouched.
 */
int import_ubuf(int rw, void __user *buf, size_t len, struct iov_iter *i)
{
	size_t capped = len > MAX_RW_COUNT ? MAX_RW_COUNT : len;

	if (unlikely(!access_ok(buf, capped)))
		return -EFAULT;

	iov_iter_ubuf(i, rw, buf, capped);
	return 0;
}
/**
* iov_iter_restore() - Restore a &struct iov_iter to the same state as when
* iov_iter_save_state() was called.
@ -1869,8 +1884,8 @@ EXPORT_SYMBOL(import_single_range);
*/
void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state)
{
if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i)) &&
!iov_iter_is_kvec(i) && !iter_is_ubuf(i))
if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i) &&
!iter_is_ubuf(i)) && !iov_iter_is_kvec(i))
return;
i->iov_offset = state->iov_offset;
i->count = state->count;
@ -1889,6 +1904,6 @@ void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state)
if (iov_iter_is_bvec(i))
i->bvec -= state->nr_segs - i->nr_segs;
else
i->iov -= state->nr_segs - i->nr_segs;
i->__iov -= state->nr_segs - i->nr_segs;
i->nr_segs = state->nr_segs;
}

View File

@ -1478,7 +1478,7 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
size_t, vlen, int, behavior, unsigned int, flags)
{
ssize_t ret;
struct iovec iovstack[UIO_FASTIOV], iovec;
struct iovec iovstack[UIO_FASTIOV];
struct iovec *iov = iovstack;
struct iov_iter iter;
struct task_struct *task;
@ -1525,12 +1525,11 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
total_len = iov_iter_count(&iter);
while (iov_iter_count(&iter)) {
iovec = iov_iter_iovec(&iter);
ret = do_madvise(mm, (unsigned long)iovec.iov_base,
iovec.iov_len, behavior);
ret = do_madvise(mm, (unsigned long)iter_iov_addr(&iter),
iter_iov_len(&iter), behavior);
if (ret < 0)
break;
iov_iter_advance(&iter, iovec.iov_len);
iov_iter_advance(&iter, iter_iov_len(&iter));
}
ret = (total_len - iov_iter_count(&iter)) ? : ret;

View File

@ -975,9 +975,10 @@ static int do_mmap_private(struct vm_area_struct *vma,
*/
if (capabilities & NOMMU_MAP_DIRECT) {
ret = call_mmap(vma->vm_file, vma);
/* shouldn't return success if we're not sharing */
if (WARN_ON_ONCE(!is_nommu_shared_mapping(vma->vm_flags)))
ret = -ENOSYS;
if (ret == 0) {
/* shouldn't return success if we're not sharing */
BUG_ON(!(vma->vm_flags & VM_MAYSHARE));
vma->vm_region->vm_top = vma->vm_region->vm_end;
return 0;
}
@ -1118,7 +1119,7 @@ unsigned long do_mmap(struct file *file,
* these cases, sharing is handled in the driver or filesystem rather
* than here
*/
if (vm_flags & VM_MAYSHARE) {
if (is_nommu_shared_mapping(vm_flags)) {
struct vm_region *pregion;
unsigned long pglen, rpglen, pgend, rpgend, start;
@ -1128,7 +1129,7 @@ unsigned long do_mmap(struct file *file,
for (rb = rb_first(&nommu_region_tree); rb; rb = rb_next(rb)) {
pregion = rb_entry(rb, struct vm_region, vm_rb);
if (!(pregion->vm_flags & VM_MAYSHARE))
if (!is_nommu_shared_mapping(pregion->vm_flags))
continue;
/* search for overlapping mappings on the same file */
@ -1575,7 +1576,7 @@ static unsigned long do_mremap(unsigned long addr,
if (vma->vm_end != vma->vm_start + old_len)
return (unsigned long) -EFAULT;
if (vma->vm_flags & VM_MAYSHARE)
if (is_nommu_shared_mapping(vma->vm_flags))
return (unsigned long) -EPERM;
if (new_len > vma->vm_region->vm_end - vma->vm_region->vm_start)

View File

@ -114,6 +114,9 @@
#include <linux/memcontrol.h>
#include <linux/prefetch.h>
#include <linux/compat.h>
#include <linux/mroute.h>
#include <linux/mroute6.h>
#include <linux/icmpv6.h>
#include <linux/uaccess.h>
@ -138,6 +141,7 @@
#include <net/tcp.h>
#include <net/busy_poll.h>
#include <net/phonet/phonet.h>
#include <linux/ethtool.h>
@ -4028,3 +4032,63 @@ int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
return sk->sk_prot->bind_add(sk, addr, addr_len);
}
EXPORT_SYMBOL(sock_bind_add);
/* Run a protocol ioctl whose argument is both input and output: copy @size
 * bytes from the user buffer @arg into the kernel buffer @karg, call the
 * protocol's ->ioctl() on @karg, and on success copy the same @size bytes
 * back to @arg.
 *
 * Returns 0 on success, -EFAULT if either user copy fails, or the non-zero
 * value returned by ->ioctl() (nothing is copied back in that case).
 */
int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
		     void __user *arg, void *karg, size_t size)
{
	int err;

	if (copy_from_user(karg, arg, size))
		return -EFAULT;

	err = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
	if (!err && copy_to_user(arg, karg, size))
		err = -EFAULT;

	return err;
}
EXPORT_SYMBOL(sock_ioctl_inout);
/* Default ioctl wrapper for the common case where the protocol handler only
 * produces a 4-byte (int) result and takes no input from userspace.
 *
 * The handler writes into a zero-initialised kernel int; if it returns 0,
 * that int is stored to the user pointer @arg with put_user(). A non-zero
 * handler result is returned as-is and nothing is written to @arg.
 */
static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
{
	int result = 0;
	int err;

	err = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &result);
	if (!err)
		err = put_user(result, (int __user *)arg);

	return err;
}
/* Entry point for protocol-level socket ioctls. It moves the argument
 * between userspace and kernel memory (how much depends on the
 * protocol/ioctl) so that the protocol ->ioctl() callbacks only ever see
 * kernel memory.
 *
 * AF_INET and AF_INET6 raw sockets and phonet sockets are first offered to
 * their family-specific helper. A helper result <= 0 means it handled the
 * ioctl and is returned directly; a result > 0 (also used when no helper
 * applies) falls back to sock_ioctl_out().
 */
int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
{
	int handled;

	if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET)
		handled = ipmr_sk_ioctl(sk, cmd, arg);
	else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6)
		handled = ip6mr_sk_ioctl(sk, cmd, arg);
	else if (sk_is_phonet(sk))
		handled = phonet_sk_ioctl(sk, cmd, arg);
	else
		handled = 1;	/* no special helper for this socket */

	/* Not consumed by a helper: use the default int-out handler */
	if (handled > 0)
		return sock_ioctl_out(sk, cmd, arg);

	/* The helper processed the ioctl; hand back its result */
	return handled;
}
EXPORT_SYMBOL(sk_ioctl);

View File

@ -297,7 +297,7 @@ int dccp_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *optlen);
int dccp_setsockopt(struct sock *sk, int level, int optname,
sockptr_t optval, unsigned int optlen);
int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg);
int dccp_ioctl(struct sock *sk, int cmd, int *karg);
int dccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
int dccp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
int flags, int *addr_len);

View File

@ -371,7 +371,7 @@ __poll_t dccp_poll(struct file *file, struct socket *sock,
EXPORT_SYMBOL_GPL(dccp_poll);
int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg)
int dccp_ioctl(struct sock *sk, int cmd, int *karg)
{
int rc = -ENOTCONN;
@ -382,17 +382,17 @@ int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg)
switch (cmd) {
case SIOCOUTQ: {
int amount = sk_wmem_alloc_get(sk);
*karg = sk_wmem_alloc_get(sk);
/* Using sk_wmem_alloc here because sk_wmem_queued is not used by DCCP and
* always 0, comparably to UDP.
*/
rc = put_user(amount, (int __user *)arg);
rc = 0;
}
break;
case SIOCINQ: {
struct sk_buff *skb;
unsigned long amount = 0;
*karg = 0;
skb = skb_peek(&sk->sk_receive_queue);
if (skb != NULL) {
@ -400,9 +400,9 @@ int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg)
* We will only return the amount of this packet since
* that is all that will be read.
*/
amount = skb->len;
*karg = skb->len;
}
rc = put_user(amount, (int __user *)arg);
rc = 0;
}
break;
default:

View File

@ -162,7 +162,7 @@ static int ieee802154_sock_ioctl(struct socket *sock, unsigned int cmd,
default:
if (!sk->sk_prot->ioctl)
return -ENOIOCTLCMD;
return sk->sk_prot->ioctl(sk, cmd, arg);
return sk_ioctl(sk, cmd, (void __user *)arg);
}
}
@ -524,22 +524,21 @@ out:
return err;
}
static int dgram_ioctl(struct sock *sk, int cmd, unsigned long arg)
static int dgram_ioctl(struct sock *sk, int cmd, int *karg)
{
switch (cmd) {
case SIOCOUTQ:
{
int amount = sk_wmem_alloc_get(sk);
*karg = sk_wmem_alloc_get(sk);
return put_user(amount, (int __user *)arg);
return 0;
}
case SIOCINQ:
{
struct sk_buff *skb;
unsigned long amount;
amount = 0;
*karg = 0;
spin_lock_bh(&sk->sk_receive_queue.lock);
skb = skb_peek(&sk->sk_receive_queue);
if (skb) {
@ -547,10 +546,10 @@ static int dgram_ioctl(struct sock *sk, int cmd, unsigned long arg)
* of this packet since that is all
* that will be read.
*/
amount = skb->len - ieee802154_hdr_length(skb);
*karg = skb->len - ieee802154_hdr_length(skb);
}
spin_unlock_bh(&sk->sk_receive_queue.lock);
return put_user(amount, (int __user *)arg);
return 0;
}
}

View File

@ -1004,7 +1004,7 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
break;
default:
if (sk->sk_prot->ioctl)
err = sk->sk_prot->ioctl(sk, cmd, arg);
err = sk_ioctl(sk, cmd, (void __user *)arg);
else
err = -ENOIOCTLCMD;
break;

View File

@ -1540,6 +1540,28 @@ out:
return ret;
}
/* Handle the mroute ioctls that take a structure as in/out argument.
 *
 * For SIOCGETVIFCNT and SIOCGETSGCNT the matching request structure is
 * staged in a kernel buffer on the stack and run through sock_ioctl_inout();
 * these buffers will be consumed by ipmr_ioctl().
 *
 * Returns the sock_ioctl_inout() result for those two commands, or 1
 * (a value > 0 means "ioctl not executed here") for anything else.
 */
int ipmr_sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
{
	if (cmd == SIOCGETVIFCNT) {
		struct sioc_vif_req vif_req;

		return sock_ioctl_inout(sk, cmd, arg, &vif_req,
					sizeof(vif_req));
	}

	if (cmd == SIOCGETSGCNT) {
		struct sioc_sg_req sg_req;

		return sock_ioctl_inout(sk, cmd, arg, &sg_req,
					sizeof(sg_req));
	}

	/* return code > 0 means that the ioctl was not executed */
	return 1;
}
/* Getsock opt support for the multicast routing system. */
int ip_mroute_getsockopt(struct sock *sk, int optname, sockptr_t optval,
sockptr_t optlen)
@ -1586,13 +1608,13 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, sockptr_t optval,
}
/* The IP multicast ioctl support routines. */
int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
int ipmr_ioctl(struct sock *sk, int cmd, void *arg)
{
struct sioc_sg_req sr;
struct sioc_vif_req vr;
struct vif_device *vif;
struct mfc_cache *c;
struct net *net = sock_net(sk);
struct sioc_vif_req *vr;
struct sioc_sg_req *sr;
struct mr_table *mrt;
mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
@ -1601,40 +1623,33 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
switch (cmd) {
case SIOCGETVIFCNT:
if (copy_from_user(&vr, arg, sizeof(vr)))
return -EFAULT;
if (vr.vifi >= mrt->maxvif)
vr = (struct sioc_vif_req *)arg;
if (vr->vifi >= mrt->maxvif)
return -EINVAL;
vr.vifi = array_index_nospec(vr.vifi, mrt->maxvif);
vr->vifi = array_index_nospec(vr->vifi, mrt->maxvif);
read_lock(&mrt_lock);
vif = &mrt->vif_table[vr.vifi];
if (VIF_EXISTS(mrt, vr.vifi)) {
vr.icount = vif->pkt_in;
vr.ocount = vif->pkt_out;
vr.ibytes = vif->bytes_in;
vr.obytes = vif->bytes_out;
vif = &mrt->vif_table[vr->vifi];
if (VIF_EXISTS(mrt, vr->vifi)) {
vr->icount = vif->pkt_in;
vr->ocount = vif->pkt_out;
vr->ibytes = vif->bytes_in;
vr->obytes = vif->bytes_out;
read_unlock(&mrt_lock);
if (copy_to_user(arg, &vr, sizeof(vr)))
return -EFAULT;
return 0;
}
read_unlock(&mrt_lock);
return -EADDRNOTAVAIL;
case SIOCGETSGCNT:
if (copy_from_user(&sr, arg, sizeof(sr)))
return -EFAULT;
sr = (struct sioc_sg_req *)arg;
rcu_read_lock();
c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr);
c = ipmr_cache_find(mrt, sr->src.s_addr, sr->grp.s_addr);
if (c) {
sr.pktcnt = c->_c.mfc_un.res.pkt;
sr.bytecnt = c->_c.mfc_un.res.bytes;
sr.wrong_if = c->_c.mfc_un.res.wrong_if;
sr->pktcnt = c->_c.mfc_un.res.pkt;
sr->bytecnt = c->_c.mfc_un.res.bytes;
sr->wrong_if = c->_c.mfc_un.res.wrong_if;
rcu_read_unlock();
if (copy_to_user(arg, &sr, sizeof(sr)))
return -EFAULT;
return 0;
}
rcu_read_unlock();

View File

@ -855,29 +855,29 @@ static int raw_getsockopt(struct sock *sk, int level, int optname,
return do_raw_getsockopt(sk, level, optname, optval, optlen);
}
static int raw_ioctl(struct sock *sk, int cmd, unsigned long arg)
static int raw_ioctl(struct sock *sk, int cmd, int *karg)
{
switch (cmd) {
case SIOCOUTQ: {
int amount = sk_wmem_alloc_get(sk);
return put_user(amount, (int __user *)arg);
*karg = sk_wmem_alloc_get(sk);
return 0;
}
case SIOCINQ: {
struct sk_buff *skb;
int amount = 0;
spin_lock_bh(&sk->sk_receive_queue.lock);
skb = skb_peek(&sk->sk_receive_queue);
if (skb)
amount = skb->len;
*karg = skb->len;
else
*karg = 0;
spin_unlock_bh(&sk->sk_receive_queue.lock);
return put_user(amount, (int __user *)arg);
return 0;
}
default:
#ifdef CONFIG_IP_MROUTE
return ipmr_ioctl(sk, cmd, (void __user *)arg);
return ipmr_ioctl(sk, cmd, karg);
#else
return -ENOIOCTLCMD;
#endif

View File

@ -596,7 +596,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
}
EXPORT_SYMBOL(tcp_poll);
int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
int tcp_ioctl(struct sock *sk, int cmd, int *karg)
{
struct tcp_sock *tp = tcp_sk(sk);
int answ;
@ -638,7 +638,8 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
return -ENOIOCTLCMD;
}
return put_user(answ, (int __user *)arg);
*karg = answ;
return 0;
}
EXPORT_SYMBOL(tcp_ioctl);

View File

@ -1717,21 +1717,19 @@ static int first_packet_length(struct sock *sk)
* IOCTL requests applicable to the UDP protocol
*/
int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
int udp_ioctl(struct sock *sk, int cmd, int *karg)
{
switch (cmd) {
case SIOCOUTQ:
{
int amount = sk_wmem_alloc_get(sk);
return put_user(amount, (int __user *)arg);
*karg = sk_wmem_alloc_get(sk);
return 0;
}
case SIOCINQ:
{
int amount = max_t(int, 0, first_packet_length(sk));
return put_user(amount, (int __user *)arg);
*karg = max_t(int, 0, first_packet_length(sk));
return 0;
}
default:

View File

@ -594,7 +594,7 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
prot = READ_ONCE(sk->sk_prot);
if (!prot->ioctl)
return -ENOIOCTLCMD;
return prot->ioctl(sk, cmd, arg);
return sk_ioctl(sk, cmd, (void __user *)arg);
}
/*NOTREACHED*/
return 0;

View File

@ -1853,11 +1853,10 @@ int ip6_mroute_getsockopt(struct sock *sk, int optname, sockptr_t optval,
/*
* The IP multicast ioctl support routines.
*/
int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg)
int ip6mr_ioctl(struct sock *sk, int cmd, void *arg)
{
struct sioc_sg_req6 sr;
struct sioc_mif_req6 vr;
struct sioc_sg_req6 *sr;
struct sioc_mif_req6 *vr;
struct vif_device *vif;
struct mfc6_cache *c;
struct net *net = sock_net(sk);
@ -1869,40 +1868,33 @@ int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg)
switch (cmd) {
case SIOCGETMIFCNT_IN6:
if (copy_from_user(&vr, arg, sizeof(vr)))
return -EFAULT;
if (vr.mifi >= mrt->maxvif)
vr = (struct sioc_mif_req6 *)arg;
if (vr->mifi >= mrt->maxvif)
return -EINVAL;
vr.mifi = array_index_nospec(vr.mifi, mrt->maxvif);
vr->mifi = array_index_nospec(vr->mifi, mrt->maxvif);
read_lock(&mrt_lock);
vif = &mrt->vif_table[vr.mifi];
if (VIF_EXISTS(mrt, vr.mifi)) {
vr.icount = vif->pkt_in;
vr.ocount = vif->pkt_out;
vr.ibytes = vif->bytes_in;
vr.obytes = vif->bytes_out;
vif = &mrt->vif_table[vr->mifi];
if (VIF_EXISTS(mrt, vr->mifi)) {
vr->icount = vif->pkt_in;
vr->ocount = vif->pkt_out;
vr->ibytes = vif->bytes_in;
vr->obytes = vif->bytes_out;
read_unlock(&mrt_lock);
if (copy_to_user(arg, &vr, sizeof(vr)))
return -EFAULT;
return 0;
}
read_unlock(&mrt_lock);
return -EADDRNOTAVAIL;
case SIOCGETSGCNT_IN6:
if (copy_from_user(&sr, arg, sizeof(sr)))
return -EFAULT;
sr = (struct sioc_sg_req6 *)arg;
rcu_read_lock();
c = ip6mr_cache_find(mrt, &sr.src.sin6_addr, &sr.grp.sin6_addr);
c = ip6mr_cache_find(mrt, &sr->src.sin6_addr,
&sr->grp.sin6_addr);
if (c) {
sr.pktcnt = c->_c.mfc_un.res.pkt;
sr.bytecnt = c->_c.mfc_un.res.bytes;
sr.wrong_if = c->_c.mfc_un.res.wrong_if;
sr->pktcnt = c->_c.mfc_un.res.pkt;
sr->bytecnt = c->_c.mfc_un.res.bytes;
sr->wrong_if = c->_c.mfc_un.res.wrong_if;
rcu_read_unlock();
if (copy_to_user(arg, &sr, sizeof(sr)))
return -EFAULT;
return 0;
}
rcu_read_unlock();

View File

@ -1116,29 +1116,29 @@ static int rawv6_getsockopt(struct sock *sk, int level, int optname,
return do_rawv6_getsockopt(sk, level, optname, optval, optlen);
}
static int rawv6_ioctl(struct sock *sk, int cmd, unsigned long arg)
static int rawv6_ioctl(struct sock *sk, int cmd, int *karg)
{
switch (cmd) {
case SIOCOUTQ: {
int amount = sk_wmem_alloc_get(sk);
return put_user(amount, (int __user *)arg);
*karg = sk_wmem_alloc_get(sk);
return 0;
}
case SIOCINQ: {
struct sk_buff *skb;
int amount = 0;
spin_lock_bh(&sk->sk_receive_queue.lock);
skb = skb_peek(&sk->sk_receive_queue);
if (skb)
amount = skb->len;
*karg = skb->len;
else
*karg = 0;
spin_unlock_bh(&sk->sk_receive_queue.lock);
return put_user(amount, (int __user *)arg);
return 0;
}
default:
#ifdef CONFIG_IPV6_MROUTE
return ip6mr_ioctl(sk, cmd, (void __user *)arg);
return ip6mr_ioctl(sk, cmd, karg);
#else
return -ENOIOCTLCMD;
#endif

View File

@ -272,7 +272,7 @@ int l2tp_nl_register_ops(enum l2tp_pwtype pw_type, const struct l2tp_nl_cmd_ops
void l2tp_nl_unregister_ops(enum l2tp_pwtype pw_type);
/* IOCTL helper for IP encap modules. */
int l2tp_ioctl(struct sock *sk, int cmd, unsigned long arg);
int l2tp_ioctl(struct sock *sk, int cmd, int *karg);
/* Extract the tunnel structure from a socket's sk_user_data pointer,
* validating the tunnel magic feather.

View File

@ -563,19 +563,18 @@ out:
return err ? err : copied;
}
int l2tp_ioctl(struct sock *sk, int cmd, unsigned long arg)
int l2tp_ioctl(struct sock *sk, int cmd, int *karg)
{
struct sk_buff *skb;
int amount;
switch (cmd) {
case SIOCOUTQ:
amount = sk_wmem_alloc_get(sk);
*karg = sk_wmem_alloc_get(sk);
break;
case SIOCINQ:
spin_lock_bh(&sk->sk_receive_queue.lock);
skb = skb_peek(&sk->sk_receive_queue);
amount = skb ? skb->len : 0;
*karg = skb ? skb->len : 0;
spin_unlock_bh(&sk->sk_receive_queue.lock);
break;
@ -583,7 +582,7 @@ int l2tp_ioctl(struct sock *sk, int cmd, unsigned long arg)
return -ENOIOCTLCMD;
}
return put_user(amount, (int __user *)arg);
return 0;
}
EXPORT_SYMBOL_GPL(l2tp_ioctl);

View File

@ -3620,11 +3620,10 @@ static int mptcp_ioctl_outq(const struct mptcp_sock *msk, u64 v)
return (int)delta;
}
static int mptcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
static int mptcp_ioctl(struct sock *sk, int cmd, int *karg)
{
struct mptcp_sock *msk = mptcp_sk(sk);
bool slow;
int answ;
switch (cmd) {
case SIOCINQ:
@ -3633,24 +3632,24 @@ static int mptcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
lock_sock(sk);
__mptcp_move_skbs(msk);
answ = mptcp_inq_hint(sk);
*karg = mptcp_inq_hint(sk);
release_sock(sk);
break;
case SIOCOUTQ:
slow = lock_sock_fast(sk);
answ = mptcp_ioctl_outq(msk, READ_ONCE(msk->snd_una));
*karg = mptcp_ioctl_outq(msk, READ_ONCE(msk->snd_una));
unlock_sock_fast(sk, slow);
break;
case SIOCOUTQNSD:
slow = lock_sock_fast(sk);
answ = mptcp_ioctl_outq(msk, msk->snd_nxt);
*karg = mptcp_ioctl_outq(msk, msk->snd_nxt);
unlock_sock_fast(sk, slow);
break;
default:
return -ENOIOCTLCMD;
}
return put_user(answ, (int __user *)arg);
return 0;
}
static void mptcp_subflow_early_fallback(struct mptcp_sock *msk,

View File

@ -28,24 +28,21 @@ static void pn_sock_close(struct sock *sk, long timeout)
sk_common_release(sk);
}
static int pn_ioctl(struct sock *sk, int cmd, unsigned long arg)
static int pn_ioctl(struct sock *sk, int cmd, int *karg)
{
struct sk_buff *skb;
int answ;
switch (cmd) {
case SIOCINQ:
lock_sock(sk);
skb = skb_peek(&sk->sk_receive_queue);
answ = skb ? skb->len : 0;
*karg = skb ? skb->len : 0;
release_sock(sk);
return put_user(answ, (int __user *)arg);
return 0;
case SIOCPNADDRESOURCE:
case SIOCPNDELRESOURCE: {
u32 res;
if (get_user(res, (u32 __user *)arg))
return -EFAULT;
u32 res = *karg;
if (res >= 256)
return -EINVAL;
if (cmd == SIOCPNADDRESOURCE)

View File

@ -916,10 +916,9 @@ static int pep_sock_enable(struct sock *sk, struct sockaddr *addr, int len)
return 0;
}
static int pep_ioctl(struct sock *sk, int cmd, unsigned long arg)
static int pep_ioctl(struct sock *sk, int cmd, int *karg)
{
struct pep_sock *pn = pep_sk(sk);
int answ;
int ret = -ENOIOCTLCMD;
switch (cmd) {
@ -932,13 +931,13 @@ static int pep_ioctl(struct sock *sk, int cmd, unsigned long arg)
lock_sock(sk);
if (sock_flag(sk, SOCK_URGINLINE) &&
!skb_queue_empty(&pn->ctrlreq_queue))
answ = skb_peek(&pn->ctrlreq_queue)->len;
*karg = skb_peek(&pn->ctrlreq_queue)->len;
else if (!skb_queue_empty(&sk->sk_receive_queue))
answ = skb_peek(&sk->sk_receive_queue)->len;
*karg = skb_peek(&sk->sk_receive_queue)->len;
else
answ = 0;
*karg = 0;
release_sock(sk);
ret = put_user(answ, (int __user *)arg);
ret = 0;
break;
case SIOCPNENABLEPIPE:

View File

@ -387,7 +387,7 @@ static int pn_socket_ioctl(struct socket *sock, unsigned int cmd,
return put_user(handle, (__u16 __user *)arg);
}
return sk->sk_prot->ioctl(sk, cmd, arg);
return sk_ioctl(sk, cmd, (void __user *)arg);
}
static int pn_socket_listen(struct socket *sock, int backlog)

Some files were not shown because too many files have changed in this diff Show More