Hey,

In user namespaces and why some containers might take a while to start in Concourse, I described how one can use clone(2) with the CLONE_NEWUSER flag to create a new user namespace, and then write to /proc/pid/uid_map to configure the mapping between users within that namespace and outside of it. However, I didn't go into the details of how that “write to /proc/pid/uid_map” really works under the hood.

This article expands on that.

what's that, again?

Let's first remind ourselves what that file is all about:

After the creation of a new user namespace, the uid_map file of one of the processes in the namespace may be written to once to define the mapping of user IDs in the new user namespace.

from user_namespaces(7).

The format is as follows:

    ID-inside-ns ID-outside-ns length
                                 |
                                 |
                    starting from `ID-inside-ns`, 
                    how many more can be added (sequentially)

For instance:

    0  1000  1

Meaning “One ID, 0, inside the NS, maps to ID 1000 in outer NS”.

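This way, such a file lets us both appear as one user (e.g. root) inside the namespace and remain a different, unprivileged user outside of it.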

We can see that in practice using unshare(1):

    # display the credentials in the current shell
    #
    ns1 $ id

            uid=1001(ubuntu) gid=1002(ubuntu)



    # unshares the user namespace, runs the program only after the current
    # effective user and group IDs have been mapped to the superuser UID
    # and GID in the newly created user namespace.
    #
    ns1 $ unshare -U -r bash



    # check what our uid and gid are now that we're within this new user
    # namespace
    #
    ns2 $ id

            uid=0(root) gid=0(root)



    # take a look at uid_map, see the mapping between "here and there"
    #
    ns2 $ cat /proc/self/uid_map

             0       1001          1



    # from the outside though, we can see that this process is really just
    # an unprivileged one:
    #
    ns1 $ cat /proc/$ns2_pid/status

            Uid:    1001    1001    1001    1001
            Gid:    1002    1002    1002    1002

Since /proc is a pseudo-filesystem, backed by methods that query and/or set in-memory state, we can search the kernel source code and find the methods that back the implementation of read(2)s and write(2)s against that uid_map file.

writing to /proc/pid/uid_map

Tracing the entire call graph for a vfs_write on /proc/self/uid_map, we can see that it goes all the way down to proc_uid_map_write, which then calls map_write, our function of interest.

    do_syscall_64() {
      __x64_sys_write() {
        ksys_write() {
          vfs_write() {
            __vfs_write() {
              proc_uid_map_write() {
                map_write() {
                  file_ns_capable() {
                    security_capable();
                  }
                  map_id_range_down();
                  map_id_range_down();
                }
              }
            }
          }
        }
      }
    }

That is where the trick really happens:

  1. it parses the contents written by the write(2) syscall, unmarshalling it all into a struct uid_gid_map

     /*
      * In-memory form of an ID map: the set of extents that map_write()
      * installs into a user namespace.
      */
     struct uid_gid_map { /* 64 bytes -- 1 cache line */
             /* number of extents currently held by the map */
             u32 nr_extents;
             union {
                     /* extents stored inline, inside the structure itself */
                     struct uid_gid_extent extent[UID_GID_MAP_MAX_BASE_EXTENTS];
                     /*
                      * NOTE(review): pointer-based alternative that shares its
                      * bytes with `extent`; presumably used once a map holds
                      * more extents than fit inline -- confirm against the
                      * kernel source.
                      */
                     struct {
                             struct uid_gid_extent *forward;
                             struct uid_gid_extent *reverse;
                     };
             };
     };
    
     /*
      * One contiguous range of an ID mapping, i.e. one
      * "ID-inside-ns ID-outside-ns length" line of the map file.
      */
     struct uid_gid_extent {
             /* first ID of the range, as seen inside the namespace */
             u32 first;
             /*
              * first ID of the range on the outside; map_write() rewrites it
              * to the kernel's view of that ID in the parent namespace
              */
             u32 lower_first;
             /* length of the range */
             u32 count;
     };
    
  2. configures the task's user namespace to leverage that mapping (rather than the original one that it had at the time).

We can see that in a simplified version of the code that shows its most relevant parts:

    // Simplified excerpt of the handler behind writes to /proc/pid/uid_map:
    // it translates the "outside" ID of every extent through the parent
    // namespace's map and then installs the extents into `map`.
    //
    // NOTE(review): this is an abridged listing. The parsing of `buf` into
    // `new_map`, the capability check (file_ns_capable() in the call graph)
    // and the declarations of `idx` and `ret` are not shown, so it does not
    // compile as is.
    //
    static ssize_t
    map_write(struct file* file,
              const char __user*  buf,
              size_t              count,
              loff_t*             ppos,
              int                 cap_setid,
              struct uid_gid_map* map,
              struct uid_gid_map* parent_map)
    {

            // initialize a temporary uid_gid_map structure; it starts out
            // zeroed, i.e. with no extents.
            //
            // NOTE(review): the code that fills it from `buf` is elided here.
            //
            struct uid_gid_map     new_map;
            memset(&new_map, 0, sizeof(struct uid_gid_map));


            // for each entry, convert the uid that was supplied by the user
            // as the "uid in the outside" to the kernel's view of that uid
            // in the parent user namespace.
            //
            for (idx = 0; idx < new_map.nr_extents; idx++) {
                    struct uid_gid_extent* e = &new_map.extent[idx];

                    e->lower_first = map_id_range_down(
                            parent_map, 
                            e->lower_first, 
                            e->count);
            }

            // install the translated extents, and their count, into the
            // map of the namespace.
            //
            memcpy(map->extent,
                   new_map.extent,
                   new_map.nr_extents * sizeof(new_map.extent[0]));
            map->nr_extents = new_map.nr_extents;

            return ret;
    }

Once that's done, the uid mapping of the task's user namespace has been configured.

reading from /proc/pid/uid_map

Reading is pretty much the same, except that it takes a read-only route:

    vfs_open() {
      do_dentry_open() {
        path_get();
        try_module_get();
        security_file_open();
        proc_uid_map_open();        <<
        file_ra_state_init();
      }
    }

    vfs_read() {
      __vfs_read() {
        seq_read() {
          uid_m_start();
          uid_m_show() {            <<
            map_id_up();
            seq_printf() {
              seq_vprintf();
            }
          }
        }
      }
    }

It fetches the user namespace associated with the task pointed to by the pid at open time, making that available for further reads; then, during the reads, it gives back the information associated with that user namespace.

    // Open handler for the ID-map files under /proc/pid: resolves the user
    // namespace of the task behind `inode` and stashes it in the seq_file,
    // so that later reads know which namespace to describe.
    //
    // Always returns 0. NOTE(review): none of the calls below have their
    // results checked in this simplified listing.
    //
    static int
    proc_id_map_open(struct inode*                inode,
                     struct file*                 file,
                     const struct seq_operations* seq_ops)
    {
            // the task the pid refers to, and the user namespace taken from
            // its credentials
            //
            struct task_struct*    target   = get_proc_task(inode);
            struct user_namespace* owner_ns =
                    get_user_ns(task_cred_xxx(target, user_ns));

            // set up sequential reading for this file; the seq_file is then
            // picked up from file->private_data
            //
            seq_open(file, seq_ops);

            // hand the namespace over to the read path
            //
            ((struct seq_file*)file->private_data)->private = owner_ns;

            return 0;
    }

This way, when performing the reads, it can go through the extents that were set for that user namespace, performing the proper UID translations according to who's reading that file and then formatting the string accordingly.

    static int
    uid_m_show(struct seq_file* seq, void* v)
    {
            struct user_namespace* ns     = seq->private;
            struct uid_gid_extent* extent = v;
            struct user_namespace* lower_ns;
            uid_t                  lower;

            // perform the proper transformations according to who's reading it
            //
            lower = from_kuid(lower_ns, KUIDT_INIT(extent->lower_first));

            // display
            //
            seq_printf(seq, "%10u %10u %10u\n", 
                    extent->first, lower, extent->count);

            return 0;
    }