fuse: fix deadlock between atomic O_TRUNC and page invalidation

Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2207472
Upstream status: Linus

commit 2fdbb8dd01556e1501132b5ad3826e8f71e24a8b
Author: Miklos Szeredi <mszeredi@redhat.com>
Date:   Fri Apr 22 15:48:53 2022 +0200

    fuse: fix deadlock between atomic O_TRUNC and page invalidation
    
    fuse_finish_open() will be called with FUSE_NOWRITE set in case of atomic
    O_TRUNC open(), so commit 76224355db75 ("fuse: truncate pagecache on
    atomic_o_trunc") replaced invalidate_inode_pages2() by truncate_pagecache()
    in such a case to avoid the A-A deadlock. However, we found another A-B-B-A
    deadlock related to the case above, which causes the xfstests
    generic/464 testcase to hang in our virtio-fs test environment.
    
    For example, consider two processes concurrently opening the same file, one
    with O_TRUNC and the other without O_TRUNC. The deadlock case is described
    below: if open(O_TRUNC) has already called fuse_set_nowrite() (acquired A)
    and is trying to lock a page (acquiring B), open() could already hold the
    page lock (acquired B) and be waiting on page writeback (acquiring A). This
    would lead to a deadlock.
    
    open(O_TRUNC)
    ----------------------------------------------------------------
    fuse_open_common
      inode_lock            [C acquire]
      fuse_set_nowrite      [A acquire]
    
      fuse_finish_open
        truncate_pagecache
          lock_page         [B acquire]
          truncate_inode_page
          unlock_page       [B release]
    
      fuse_release_nowrite  [A release]
      inode_unlock          [C release]
    ----------------------------------------------------------------
    
    open()
    ----------------------------------------------------------------
    fuse_open_common
      fuse_finish_open
        invalidate_inode_pages2
          lock_page         [B acquire]
            fuse_launder_page
              fuse_wait_on_page_writeback [A acquire & release]
          unlock_page       [B release]
    ----------------------------------------------------------------
    
    Besides this case, all calls to invalidate_inode_pages2() and
    invalidate_inode_pages2_range() in the fuse code can also deadlock with
    open(O_TRUNC).
    
    Fix by moving the truncate_pagecache() call outside the nowrite protected
    region.  The nowrite protection is only for the delayed writeback
    (writeback_cache) case, where the inode lock does not protect against
    truncation racing with writes on the server.  Write syscalls racing with
    page cache truncation still get the inode lock protection.
    
    This patch also changes the order of filemap_invalidate_lock()
    vs. fuse_set_nowrite() in fuse_open_common().  This new order matches the
    order found in fuse_file_fallocate() and fuse_do_setattr().
    
    Reported-by: Jiachen Zhang <zhangjiachen.jaycee@bytedance.com>
    Tested-by: Jiachen Zhang <zhangjiachen.jaycee@bytedance.com>
    Fixes: e4648309b8 ("fuse: truncate pending writes on O_TRUNC")
    Cc: <stable@vger.kernel.org>
    Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>

Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
open(O_TRUNC)
----------------------------------------------------------------
fuse_open_common
  inode_lock            [C acquire]
  fuse_set_nowrite      [A acquire]

  fuse_finish_open
    truncate_pagecache
      lock_page         [B acquire]
      truncate_inode_page
      unlock_page       [B release]

  fuse_release_nowrite  [A release]
  inode_unlock          [C release]
----------------------------------------------------------------

open()
----------------------------------------------------------------
fuse_open_common
  fuse_finish_open
    invalidate_inode_pages2
      lock_page         [B acquire]
        fuse_launder_page
          fuse_wait_on_page_writeback [A acquire & release]
      unlock_page       [B release]
----------------------------------------------------------------

Besides this case, all calls to invalidate_inode_pages2() and
invalidate_inode_pages2_range() in the fuse code can also deadlock with
open(O_TRUNC).

Fix by moving the truncate_pagecache() call outside the nowrite protected
region.  The nowrite protection is only for the delayed writeback
(writeback_cache) case, where the inode lock does not protect against
truncation racing with writes on the server.  Write syscalls racing with
page cache truncation still get the inode lock protection.

This patch also changes the order of filemap_invalidate_lock()
vs. fuse_set_nowrite() in fuse_open_common().  This new order matches the
order found in fuse_file_fallocate() and fuse_do_setattr().

Reported-by: Jiachen Zhang <zhangjiachen.jaycee@bytedance.com>
Tested-by: Jiachen Zhang <zhangjiachen.jaycee@bytedance.com>
Fixes: e4648309b8 ("fuse: truncate pending writes on O_TRUNC")
Cc: <stable@vger.kernel.org>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
(cherry picked from commit 2fdbb8dd01556e1501132b5ad3826e8f71e24a8b)
This commit is contained in:
Miklos Szeredi 2023-06-09 16:28:59 +02:00
parent 36e12ff997
commit 9afe2bf7b5
2 changed files with 23 additions and 14 deletions

View File

@ -537,6 +537,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
struct fuse_file *ff;
void *security_ctx = NULL;
u32 security_ctxlen;
bool trunc = flags & O_TRUNC;
/* Userspace expects S_IFREG in create mode */
BUG_ON((mode & S_IFMT) != S_IFREG);
@ -561,7 +562,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
inarg.mode = mode;
inarg.umask = current_umask();
if (fm->fc->handle_killpriv_v2 && (flags & O_TRUNC) &&
if (fm->fc->handle_killpriv_v2 && trunc &&
!(flags & O_EXCL) && !capable(CAP_FSETID)) {
inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID;
}
@ -623,6 +624,10 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
} else {
file->private_data = ff;
fuse_finish_open(inode, file);
if (fm->fc->atomic_o_trunc && trunc)
truncate_pagecache(inode, 0);
else if (!(ff->open_flags & FOPEN_KEEP_CACHE))
invalidate_inode_pages2(inode->i_mapping);
}
return err;

View File

@ -210,14 +210,10 @@ void fuse_finish_open(struct inode *inode, struct file *file)
fi->attr_version = atomic64_inc_return(&fc->attr_version);
i_size_write(inode, 0);
spin_unlock(&fi->lock);
truncate_pagecache(inode, 0);
fuse_invalidate_attr(inode);
if (fc->writeback_cache)
file_update_time(file);
} else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) {
invalidate_inode_pages2(inode->i_mapping);
}
if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache)
fuse_link_write_file(file);
}
@ -240,30 +236,38 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
if (err)
return err;
if (is_wb_truncate || dax_truncate) {
if (is_wb_truncate || dax_truncate)
inode_lock(inode);
fuse_set_nowrite(inode);
}
if (dax_truncate) {
down_write(&get_fuse_inode(inode)->i_mmap_sem);
err = fuse_dax_break_layouts(inode, 0, 0);
if (err)
goto out;
goto out_inode_unlock;
}
if (is_wb_truncate || dax_truncate)
fuse_set_nowrite(inode);
err = fuse_do_open(fm, get_node_id(inode), file, isdir);
if (!err)
fuse_finish_open(inode, file);
out:
if (is_wb_truncate || dax_truncate)
fuse_release_nowrite(inode);
if (!err) {
struct fuse_file *ff = file->private_data;
if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC))
truncate_pagecache(inode, 0);
else if (!(ff->open_flags & FOPEN_KEEP_CACHE))
invalidate_inode_pages2(inode->i_mapping);
}
if (dax_truncate)
up_write(&get_fuse_inode(inode)->i_mmap_sem);
if (is_wb_truncate | dax_truncate) {
fuse_release_nowrite(inode);
out_inode_unlock:
if (is_wb_truncate || dax_truncate)
inode_unlock(inode);
}
return err;
}