summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r--tools/virtiofsd/fuse_lowlevel.h1
-rw-r--r--tools/virtiofsd/helper.c47
-rw-r--r--tools/virtiofsd/passthrough_ll.c102
3 files changed, 133 insertions, 17 deletions
diff --git a/tools/virtiofsd/fuse_lowlevel.h b/tools/virtiofsd/fuse_lowlevel.h
index 8f6d705b5c..562fd5241e 100644
--- a/tools/virtiofsd/fuse_lowlevel.h
+++ b/tools/virtiofsd/fuse_lowlevel.h
@@ -1777,6 +1777,7 @@ struct fuse_cmdline_opts {
     int syslog;
     int log_level;
     unsigned int max_idle_threads;
+    unsigned long rlimit_nofile;
 };
 
 /**
diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c
index 819c2bc13c..00a1ef666a 100644
--- a/tools/virtiofsd/helper.c
+++ b/tools/virtiofsd/helper.c
@@ -23,6 +23,8 @@
 #include <stdlib.h>
 #include <string.h>
 #include <sys/param.h>
+#include <sys/time.h>
+#include <sys/resource.h>
 #include <unistd.h>
 
 #define FUSE_HELPER_OPT(t, p)                       \
@@ -53,6 +55,7 @@ static const struct fuse_opt fuse_helper_opts[] = {
     FUSE_HELPER_OPT("subtype=", nodefault_subtype),
     FUSE_OPT_KEY("subtype=", FUSE_OPT_KEY_KEEP),
     FUSE_HELPER_OPT("max_idle_threads=%u", max_idle_threads),
+    FUSE_HELPER_OPT("--rlimit-nofile=%lu", rlimit_nofile),
     FUSE_HELPER_OPT("--syslog", syslog),
     FUSE_HELPER_OPT_VALUE("log_level=debug", log_level, FUSE_LOG_DEBUG),
     FUSE_HELPER_OPT_VALUE("log_level=info", log_level, FUSE_LOG_INFO),
@@ -171,6 +174,10 @@ void fuse_cmdline_help(void)
            "                               default: no_writeback\n"
            "    -o xattr|no_xattr          enable/disable xattr\n"
            "                               default: no_xattr\n"
+           "    --rlimit-nofile=<num>      set maximum number of file descriptors\n"
+           "                               (0 leaves rlimit unchanged)\n"
+           "                               default: min(1000000, fs.file-max - 16384)\n"
+           "                                        if the current rlimit is lower\n"
            );
 }
 
@@ -191,11 +198,51 @@ static int fuse_helper_opt_proc(void *data, const char *arg, int key,
     }
 }
 
+static unsigned long get_default_rlimit_nofile(void)
+{
+    g_autofree gchar *file_max_str = NULL;
+    const rlim_t reserved_fds = 16384; /* leave at least this many fds free */
+    rlim_t max_fds = 1000000; /* our default RLIMIT_NOFILE target */
+    rlim_t file_max;
+    struct rlimit rlim;
+
+    /*
+     * Reduce max_fds below the system-wide maximum, if necessary.  This
+     * ensures there are fds available for other processes so we don't
+     * cause resource exhaustion.
+     */
+    if (!g_file_get_contents("/proc/sys/fs/file-max", &file_max_str,
+                             NULL, NULL)) {
+        fuse_log(FUSE_LOG_ERR, "can't read /proc/sys/fs/file-max\n");
+        exit(1);
+    }
+    file_max = g_ascii_strtoull(file_max_str, NULL, 10);
+    if (file_max < 2 * reserved_fds) {
+        fuse_log(FUSE_LOG_ERR,
+                 "The fs.file-max sysctl is too low (%lu) to allow a "
+                 "reasonable number of open files.\n",
+                 (unsigned long)file_max);
+        exit(1);
+    }
+    max_fds = MIN(file_max - reserved_fds, max_fds);
+
+    if (getrlimit(RLIMIT_NOFILE, &rlim) < 0) {
+        fuse_log(FUSE_LOG_ERR, "getrlimit(RLIMIT_NOFILE): %m\n");
+        exit(1);
+    }
+
+    if (rlim.rlim_cur >= max_fds) {
+        return 0; /* we have more fds available than required! */
+    }
+    return max_fds;
+}
+
 int fuse_parse_cmdline(struct fuse_args *args, struct fuse_cmdline_opts *opts)
 {
     memset(opts, 0, sizeof(struct fuse_cmdline_opts));
 
     opts->max_idle_threads = 10;
+    opts->rlimit_nofile = get_default_rlimit_nofile();
     opts->foreground = 1;
 
     if (fuse_opt_parse(args, opts, fuse_helper_opts, fuse_helper_opt_proc) ==
diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c
index 4c35c95b25..3ba1d90984 100644
--- a/tools/virtiofsd/passthrough_ll.c
+++ b/tools/virtiofsd/passthrough_ll.c
@@ -2531,11 +2531,24 @@ static void print_capabilities(void)
 }
 
 /*
+ * Drop all Linux capabilities because the wait parent process only needs to
+ * sit in waitpid(2) and terminate.
+ */
+static void setup_wait_parent_capabilities(void)
+{
+    capng_setpid(syscall(SYS_gettid));
+    capng_clear(CAPNG_SELECT_BOTH);
+    capng_apply(CAPNG_SELECT_BOTH);
+}
+
+/*
  * Move to a new mount, net, and pid namespaces to isolate this process.
  */
 static void setup_namespaces(struct lo_data *lo, struct fuse_session *se)
 {
     pid_t child;
+    char template[] = "virtiofsd-XXXXXX";
+    char *tmpdir;
 
     /*
      * Create a new pid namespace for *child* processes.  We'll have to
@@ -2561,6 +2574,8 @@ static void setup_namespaces(struct lo_data *lo, struct fuse_session *se)
         pid_t waited;
         int wstatus;
 
+        setup_wait_parent_capabilities();
+
         /* The parent waits for the child */
         do {
             waited = waitpid(child, &wstatus, 0);
@@ -2597,12 +2612,33 @@ static void setup_namespaces(struct lo_data *lo, struct fuse_session *se)
         exit(1);
     }
 
+    tmpdir = mkdtemp(template);
+    if (!tmpdir) {
+        fuse_log(FUSE_LOG_ERR, "tmpdir(%s): %m\n", template);
+        exit(1);
+    }
+
+    if (mount("/proc/self/fd", tmpdir, NULL, MS_BIND, NULL) < 0) {
+        fuse_log(FUSE_LOG_ERR, "mount(/proc/self/fd, %s, MS_BIND): %m\n",
+                 tmpdir);
+        exit(1);
+    }
+
     /* Now we can get our /proc/self/fd directory file descriptor */
-    lo->proc_self_fd = open("/proc/self/fd", O_PATH);
+    lo->proc_self_fd = open(tmpdir, O_PATH);
     if (lo->proc_self_fd == -1) {
-        fuse_log(FUSE_LOG_ERR, "open(/proc/self/fd, O_PATH): %m\n");
+        fuse_log(FUSE_LOG_ERR, "open(%s, O_PATH): %m\n", tmpdir);
         exit(1);
     }
+
+    if (umount2(tmpdir, MNT_DETACH) < 0) {
+        fuse_log(FUSE_LOG_ERR, "umount2(%s, MNT_DETACH): %m\n", tmpdir);
+        exit(1);
+    }
+
+    if (rmdir(tmpdir) < 0) {
+        fuse_log(FUSE_LOG_ERR, "rmdir(%s): %m\n", tmpdir);
+    }
 }
 
 /*
@@ -2643,7 +2679,7 @@ static void setup_mounts(const char *source)
     int oldroot;
     int newroot;
 
-    if (mount(source, source, NULL, MS_BIND, NULL) < 0) {
+    if (mount(source, source, NULL, MS_BIND | MS_REC, NULL) < 0) {
         fuse_log(FUSE_LOG_ERR, "mount(%s, %s, MS_BIND): %m\n", source, source);
         exit(1);
     }
@@ -2696,6 +2732,43 @@ static void setup_mounts(const char *source)
 }
 
 /*
+ * Only keep whitelisted capabilities that are needed for file system operation
+ */
+static void setup_capabilities(void)
+{
+    pthread_mutex_lock(&cap.mutex);
+    capng_restore_state(&cap.saved);
+
+    /*
+     * Whitelist file system-related capabilities that are needed for a file
+     * server to act like root.  Drop everything else like networking and
+     * sysadmin capabilities.
+     *
+     * Exclusions:
+     * 1. CAP_LINUX_IMMUTABLE is not included because it's only used via ioctl
+     *    and we don't support that.
+     * 2. CAP_MAC_OVERRIDE is not included because it only seems to be
+     *    used by the Smack LSM.  Omit it until there is demand for it.
+     */
+    capng_setpid(syscall(SYS_gettid));
+    capng_clear(CAPNG_SELECT_BOTH);
+    capng_updatev(CAPNG_ADD, CAPNG_PERMITTED | CAPNG_EFFECTIVE,
+            CAP_CHOWN,
+            CAP_DAC_OVERRIDE,
+            CAP_DAC_READ_SEARCH,
+            CAP_FOWNER,
+            CAP_FSETID,
+            CAP_SETGID,
+            CAP_SETUID,
+            CAP_MKNOD,
+            CAP_SETFCAP);
+    capng_apply(CAPNG_SELECT_BOTH);
+
+    cap.saved = capng_save_state();
+    pthread_mutex_unlock(&cap.mutex);
+}
+
+/*
  * Lock down this process to prevent access to other processes or files outside
  * source directory.  This reduces the impact of arbitrary code execution bugs.
  */
@@ -2705,26 +2778,21 @@ static void setup_sandbox(struct lo_data *lo, struct fuse_session *se,
     setup_namespaces(lo, se);
     setup_mounts(lo->source);
     setup_seccomp(enable_syslog);
+    setup_capabilities();
 }
 
-/* Raise the maximum number of open file descriptors */
-static void setup_nofile_rlimit(void)
+/* Set the maximum number of open file descriptors */
+static void setup_nofile_rlimit(unsigned long rlimit_nofile)
 {
-    const rlim_t max_fds = 1000000;
-    struct rlimit rlim;
-
-    if (getrlimit(RLIMIT_NOFILE, &rlim) < 0) {
-        fuse_log(FUSE_LOG_ERR, "getrlimit(RLIMIT_NOFILE): %m\n");
-        exit(1);
-    }
+    struct rlimit rlim = {
+        .rlim_cur = rlimit_nofile,
+        .rlim_max = rlimit_nofile,
+    };
 
-    if (rlim.rlim_cur >= max_fds) {
+    if (rlimit_nofile == 0) {
         return; /* nothing to do */
     }
 
-    rlim.rlim_cur = max_fds;
-    rlim.rlim_max = max_fds;
-
     if (setrlimit(RLIMIT_NOFILE, &rlim) < 0) {
         /* Ignore SELinux denials */
         if (errno == EPERM) {
@@ -2977,7 +3045,7 @@ int main(int argc, char *argv[])
 
     fuse_daemonize(opts.foreground);
 
-    setup_nofile_rlimit();
+    setup_nofile_rlimit(opts.rlimit_nofile);
 
     /* Must be before sandbox since it wants /proc */
     setup_capng();