overlayfs是一种叠加文件系统,在openwrt和安卓系统中都有很广泛的应用,overlayfs通常用于将只读根文件系统(rootfs)和可写文件系统(jffs2)进行叠加后形成一个新的文件系统,这个新的文件系统“看起来”是可读写的,这种做法的好处是:
- 对这个新文件系统的修改(删除也属于修改)都只保存在可写文件系统中,只读根文件系统不受任何影响
- 将可写文件系统格式化后,可以将整个文件系统恢复到初始状态(相当于只有只读根文件系统的状态)
- 减少flash擦写次数,延长设备使用寿命
下面就开始介绍openwrt系统中的overlayfs是如何挂载的,挂载过程可以分为2个部分:
- 只读根文件系统(rootfs)挂载过程
- overlayfs 挂载过程(包括可写文件系统(rootfs_data)挂载过程)
只读根文件系统(rootfs)挂载过程
kernel的启动流程大致如下:
prepare_namespace
prepare_namespace
负责根文件系统的挂载
/*
 * Select and mount the root filesystem, then make it the process root.
 *
 * Steps, in order:
 *  - optionally sleep for root_delay seconds;
 *  - wait for device probing to finish;
 *  - if a root name was saved, mount it directly when it starts with
 *    "mtd" or "ubi", otherwise translate it to a device number (ROOT_DEV);
 *  - try initrd_load(); if it returns non-zero, skip the regular mount;
 *  - if ROOT_DEV is still 0 and root_wait is set, poll until the device
 *    shows up;
 *  - call mount_root();
 *  - finally mount devtmpfs, move the current directory's mount onto "/"
 *    and chroot into it.
 */
void __init prepare_namespace(void)
{
	int is_floppy;

	if (root_delay) {
		printk(KERN_INFO "Waiting %d sec before mounting root device...\n",
		       root_delay);
		ssleep(root_delay);
	}

	/* wait for all devices to finish initializing */
	wait_for_device_probe();

	md_run_setup();

	if (saved_root_name[0]) {
		root_device_name = saved_root_name;
		/* "mtd..." / "ubi..." names are mounted by name, not by dev_t */
		if (!strncmp(root_device_name, "mtd", 3) ||
		    !strncmp(root_device_name, "ubi", 3)) {
			mount_block_root(root_device_name, root_mountflags);
			goto out;
		}
		ROOT_DEV = name_to_dev_t(root_device_name);
		/* strip the "/dev/" prefix from the name */
		if (strncmp(root_device_name, "/dev/", 5) == 0)
			root_device_name += 5;
	}

	if (initrd_load())
		goto out;

	/* wait for any asynchronous scanning to complete */
	if ((ROOT_DEV == 0) && root_wait) {
		printk(KERN_INFO "Waiting for root device %s...\n",
		       saved_root_name);
		while (driver_probe_done() != 0 ||
		       (ROOT_DEV = name_to_dev_t(saved_root_name)) == 0)
			msleep(5);
		async_synchronize_full();
	}

	is_floppy = MAJOR(ROOT_DEV) == FLOPPY_MAJOR;

	if (is_floppy && rd_doload && rd_load_disk(0))
		ROOT_DEV = Root_RAM0;

	mount_root();
out:
	devtmpfs_mount("dev");
	ksys_mount(".", "/", NULL, MS_MOVE, NULL);
	ksys_chroot(".");
}
- root_delay:如果cmdline中有“rootdelay=xxx”,则调用ssleep延迟xxx秒,例如“rootdelay=10”,则延时10s
- wait_for_device_probe()是等待所有的设备probe完成,
- saved_root_name:如果cmdline中有“root=xxx”,则saved_root_name=xxx。在我当前的系统中
root=/dev/mmcblk1p65
,所以saved_root_name=/dev/mmcblk1p65
- mount_block_root: 如果
root_device_name
前三个字符是“ubi”或者“mtd”,则调用mount_block_root
进行挂载根文件系统,这里主要是针对ubifs这一种文件系统进行特殊处理,因为ubifs根文件系统对应cmdline的参数一般是:ubi.mtd=1 root=ubi0:rootfs rootfstype=ubifs
,而其他文件系统一般是root=/dev/xxx
.saved_root_name
会被赋值给root_device_name
。 - ROOT_DEV:root_device_name设备对应的设备号(/dev/mmcblk1p65设备对应主设备号是259,次设备号是0)
- while (driver_probe_done() != 0): 等待所有
saved_root_name
设备probe完成(dev/xxx节点被创建) - mount_root:开始挂载根文件系统
mount_root(内核层)
/*
 * Mount the root filesystem according to ROOT_DEV and the kernel config.
 *
 * Each backend is compiled in only when its CONFIG_* option is set:
 *  - NFS root and CIFS root return right after their attempt, whether or
 *    not it succeeded (a failure is only logged);
 *  - with CONFIG_MTD_ROOTFS_ROOT_DEV, return if mount_ubi_rootfs()
 *    returns 0, otherwise continue;
 *  - with CONFIG_BLOCK, create the /dev/root node for ROOT_DEV and mount it.
 */
void __init mount_root(void)
{
#ifdef CONFIG_ROOT_NFS
	if (ROOT_DEV == Root_NFS) {
		if (!mount_nfs_root())
			printk(KERN_ERR "VFS: Unable to mount root fs via NFS.\n");
		return;
	}
#endif
#ifdef CONFIG_CIFS_ROOT
	if (ROOT_DEV == Root_CIFS) {
		if (!mount_cifs_root())
			printk(KERN_ERR "VFS: Unable to mount root fs via SMB.\n");
		return;
	}
#endif
#ifdef CONFIG_MTD_ROOTFS_ROOT_DEV
	if (!mount_ubi_rootfs())
		return;
#endif
#ifdef CONFIG_BLOCK
	{
		int err = create_dev("/dev/root", ROOT_DEV);

		/* a failure to create the node is logged but not fatal here */
		if (err < 0)
			pr_emerg("Failed to create /dev/root: %d\n", err);
		mount_block_root("/dev/root", root_mountflags);
	}
#endif
}
- CONFIG_MTD_ROOTFS_ROOT_DEV 的作用是告诉 Linux 内核在引导过程中从哪个 MTD 设备加载根文件系统,一般支持NANDFlash的设备都会开启这个选项
- mount_ubi_rootfs() 尝试挂载
ubifs
,如果挂载成功,则不再继续后续的挂载步骤。 - CONFIG_BLOCK是开启块设备子系统,对于绝大多数文件系统(EXT4、yaffs2、XFS、FAT等)都需要块设备子系统的支持。
- create_dev(“/dev/root”, ROOT_DEV) 创建了一个设备节点
/dev/root
,/dev/root
是根文件系统设备的抽象,因为它的设备号也是ROOT_DEV
,前面有提到ROOT_DEV
就是实际的根文件系统设备节点。这样做的好处是不用关心实际的根文件系统设备是什么,直接对/dev/root
进行mount
就可以实现根文件系统的挂载。 - mount_block_root(“/dev/root”, root_mountflags) 挂载
/dev/root
/*
 * Mount the root device @name, trying each candidate filesystem type.
 *
 * @name:  device to mount
 * @flags: mount flags; SB_RDONLY is added for a second pass if the first
 *         pass over all filesystem types did not succeed
 *
 * fs_names holds consecutive NUL-terminated type names (the loop advances
 * by strlen(p)+1 and stops at an empty string). For each type:
 *  - 0 from do_mount_root() means success: release the page and return;
 *  - -EACCES / -EINVAL mean "try the next type";
 *  - any other error prints diagnostics and panics.
 * If every type fails even read-only, the function panics.
 */
void __init mount_block_root(char *name, int flags)
{
	struct page *page = alloc_page(GFP_KERNEL);
	char *fs_names = page_address(page);
	char *p;
	char b[BDEVNAME_SIZE];

	/* fallback device description used in the error messages below */
	scnprintf(b, BDEVNAME_SIZE, "unknown-block(%u,%u)",
		  MAJOR(ROOT_DEV), MINOR(ROOT_DEV));
	get_fs_names(fs_names);
retry:
	for (p = fs_names; *p; p += strlen(p)+1) {
		int err = do_mount_root(name, p, flags, root_mount_data);
		switch (err) {
		case 0:
			goto out;
		case -EACCES:
		case -EINVAL:
			continue;
		}
		/*
		 * Allow the user to distinguish between failed sys_open
		 * and bad superblock on root device.
		 * and give them a list of the available devices
		 */
		printk("VFS: Cannot open root device \"%s\" or %s: error %d\n",
		       root_device_name, b, err);
		printk("Please append a correct \"root=\" boot option; here are the available partitions:\n");
		printk_all_partitions();
#ifdef CONFIG_DEBUG_BLOCK_EXT_DEVT
		printk("DEBUG_BLOCK_EXT_DEVT is enabled, you need to specify "
		       "explicit textual name for \"root=\" boot option.\n");
#endif
		panic("VFS: Unable to mount root fs on %s", b);
	}
	/* nothing mounted read-write: retry the whole list read-only once */
	if (!(flags & SB_RDONLY)) {
		flags |= SB_RDONLY;
		goto retry;
	}

	printk("List of all partitions:\n");
	printk_all_partitions();
	printk("No filesystem could mount root, tried: ");
	for (p = fs_names; *p; p += strlen(p)+1)
		printk(" %s", p);
	printk("\n");
	panic("VFS: Unable to mount root fs on %s", b);
out:
	put_page(page);
}
- get_fs_names(fs_names) 获取根文件系统类型并存入变量 fs_names,fs_names中可能保存了很多个文件系统类型,它们在 fs_names 中是以
'\0'
结尾、逐个连续存放的(所以下面的循环用 strlen(p)+1 跳到下一个类型)。 - for (p = fs_names; *p; p += strlen(p)+1) 是依次尝试使用
fs_names
里面的文件系统类型进行挂载 - do_mount_root(name, p, flags, root_mount_data) 是具体挂载过程,此处
name=/dev/root
,一旦挂载成功则退出整个挂载步骤,不会再继续尝试其他文件系统类型。
/*
 * Mount @name on "/root" as filesystem type @fs and chdir into it.
 *
 * @name:  device to mount
 * @fs:    filesystem type name
 * @flags: mount flags
 * @data:  optional mount option string; copied into a freshly allocated
 *         page when non-NULL
 *
 * On success, ROOT_DEV is updated from the superblock of the new working
 * directory and a "VFS: Mounted root ..." message is printed.
 *
 * Returns 0 on success, -ENOMEM if the option page cannot be allocated,
 * otherwise the non-zero result of init_mount().
 */
static int __init do_mount_root(const char *name, const char *fs,
				const int flags, const void *data)
{
	struct super_block *s;
	struct page *p = NULL;
	char *data_page = NULL;
	int ret;

	if (data) {
		/* init_mount() requires a full page as fifth argument */
		p = alloc_page(GFP_KERNEL);
		if (!p)
			return -ENOMEM;
		data_page = page_address(p);
		/* zero-pad. init_mount() will make sure it's terminated */
		strncpy(data_page, data, PAGE_SIZE);
	}

	ret = init_mount(name, "/root", fs, flags, data_page);
	if (ret)
		goto out;

	init_chdir("/root");
	/* superblock of the directory we just changed into */
	s = current->fs->pwd.dentry->d_sb;
	ROOT_DEV = s->s_dev;
	printk(KERN_INFO
	       "VFS: Mounted root (%s filesystem)%s on device %u:%u.\n",
	       s->s_type->name,
	       sb_rdonly(s) ? " readonly" : "",
	       MAJOR(ROOT_DEV), MINOR(ROOT_DEV));

out:
	/* the option page is released on both the success and error paths */
	if (p)
		put_page(p);
	return ret;
}
- data 是上一步传入的
root_mount_data
变量,root_mount_data
是cmdline 中 “rootflags=xxx” 参数=
后面的部分,即挂载选项。如果不为空,则需要申请内存保存data
- init_mount(name, “/root”, fs, flags, data_page) 将
/dev/root
设备挂载到/root
- init_chdir(“/root”) 将当前工作路径改为
/root
- s = current->fs->pwd.dentry->d_sb 获取当前文件系统的超级块
- ROOT_DEV = s->s_dev 将超级块的设备号赋值给 ROOT_DEV
- 这时候可以看到如下内核打印,说明rootfs已经挂载成功
[ 2.337243] VFS: Mounted root (squashfs filesystem) readonly on device 259:0.
# device 259:0 对应设备
brw------- 1 root root 259, 0 Jan 1 1970 /dev/mmcblk1p65
overlayfs 挂载过程
在openwrt系统中,overlayfs 挂载需要使用 fstools工具。fstools并不是一个具体的tool,它包含了多个小工具,这些小工具都是运行在应用层的,这也说明了overlayfs是在应用层进行挂载的。
fstools
fstools
实际上包括如下小工具:
- jffs2reset
- mount_root
- libfstools.so
除此之外,下列工具也放在fstools
的包里面,它们是基于fstools
开发出来的拓展工具,而且如果需要安装下面的工具,除了
使能CONFIG_PACKAGE_fstools=y
之外,还需要使能对应的配置。
- block (CONFIG_PACKAGE_block-mount)
- blockd (CONFIG_PACKAGE_blockd)
- snapshot_tool (CONFIG_PACKAGE_snapshot-tool)
- ubi (CONFIG_PACKAGE_ubi-utils)
mount_root(应用层)
mount_root
就是用来挂载overlayfs
的工具,它支持4种模式:
- 默认(无参数):挂载overlayfs模式
- ram :挂载基于ram的overlayfs
- stop:获取
SHUTDOWN
环境变量状态 - done:挂载结束后置文件系统状态位
//fstools-2022-06-02-93369be0/mount_root.c
int main(int argc, char **argv)
{if (argc < 2)return start(argc, argv);if (!strcmp(argv[1], "ram"))return ramoverlay();if (!strcmp(argv[1], "stop"))return stop(argc, argv);if (!strcmp(argv[1], "done"))return done(argc, argv);return -1;
}
在openwrt系统运行第一个进程(1号进程)时,mount_root
就会被调用。可以看到此时mount_root
没有携带任何参数,所以它首先走的是start(argc, argv)
逻辑。
# openwrt/package/base-files/files/lib/preinit/80_mount_root
# Run mount_root and the preinit_mount_root hooks; then, if a sysupgrade
# backup archive is present, restore it over the root filesystem.
do_mount_root() {
	mount_root
	boot_run_hook preinit_mount_root
	[ -f /sysupgrade.tgz -o -f /tmp/sysupgrade.tar ] && {
		echo "- config restore -"
		# Keep copies of the current account files so that entries
		# absent from the restored versions can be appended afterwards
		cp /etc/passwd /etc/group /etc/shadow /tmp
		cd /
		[ -f /sysupgrade.tgz ] && tar xzf /sysupgrade.tgz
		[ -f /tmp/sysupgrade.tar ] && tar xf /tmp/sysupgrade.tar
		missing_lines /tmp/passwd /etc/passwd >> /etc/passwd
		missing_lines /tmp/group /etc/group >> /etc/group
		missing_lines /tmp/shadow /etc/shadow >> /etc/shadow
		rm /tmp/passwd /tmp/group /tmp/shadow
		# Prevent configuration corruption on a power loss
		sync
	}
}
mount_root->start
/*
 * Default mode of mount_root: pick and mount the writable overlay.
 *
 * Returns -1 when the PREINIT environment variable is unset and
 * /tmp/.preinit cannot be stat()ed. Otherwise:
 *  - if no "rootfs_data" volume is found, remount /dev/root on "/" with
 *    MS_NOATIME | MS_REMOUNT;
 *  - if mount_extroot("") returns 0, stop there and return 0;
 *  - else identify the "rootfs_data" filesystem and either fall back to
 *    ramoverlay() (FS_NONE / FS_DEADCODE), call mount_overlay() (ext4,
 *    f2fs, jffs2, ubifs) or call mount_snapshot().
 *
 * argc/argv are not used.
 */
static int
start(int argc, char *argv[1])
{
	struct volume *root;
	struct volume *data = volume_find("rootfs_data");
	struct stat s;

	if (!getenv("PREINIT") && stat("/tmp/.preinit", &s))
		return -1;

	if (!data) {
		root = volume_find("rootfs");
		volume_init(root);
		ULOG_NOTE("mounting /dev/root\n");
		mount("/dev/root", "/", NULL, MS_NOATIME | MS_REMOUNT, 0);
	}

	/* Check for extroot config in rootfs before even trying rootfs_data */
	if (!mount_extroot("")) {
		ULOG_NOTE("switched to extroot\n");
		return 0;
	}

	/* There isn't extroot, so just try to mount "rootfs_data" */
	volume_init(data);
	switch (volume_identify(data)) {
	case FS_NONE:
		ULOG_WARN("no usable overlay filesystem found, using tmpfs overlay\n");
		return ramoverlay();

	case FS_DEADCODE:
		/*
		 * Filesystem isn't ready yet and we are in the preinit, so we
		 * can't afford waiting for it. Use tmpfs for now and handle it
		 * properly in the "done" call.
		 */
		ULOG_NOTE("jffs2 not ready yet, using temporary tmpfs overlay\n");
		return ramoverlay();

	case FS_EXT4:
	case FS_F2FS:
	case FS_JFFS2:
	case FS_UBIFS:
		/* the result of mount_overlay() is not propagated */
		mount_overlay(data);
		break;

	case FS_SNAPSHOT:
		mount_snapshot(data);
		break;
	}

	return 0;
}
- volume_find(“rootfs_data”) 查询分区name为
rootfs_data
的分区,这里最终调用的是volume->drv->find()
,不同类型的文件系统会有不同的实现,find()
过程会初始化一个volume
对象,但volume
对象的成员信息有可能是不完整的,还需要在接下来的init()
环节继续填充完整。 - volume_init(data) 初始化
volume
对象,继续完善volume成员信息,这里最终调用的是volume->drv->init()
- volume_identify(data) 识别volume指向的分区的文件系统类型,如果是
FS_EXT4
FS_F2FS
FS_JFFS2
FS_UBIFS
这4种文件系统之一,接下来就会执行挂载overlayfs流程,这里最终调用的是volume->drv->identify()
/*
 * Mount volume @v at /tmp/overlay and switch the root to an overlay on it.
 *
 * @v: the writable volume; NULL returns -1
 *
 * Returns -1 if the volume's block device already has a mount point,
 * the error from overlay_mount_fs() if that fails, 0 if an extroot
 * configured inside the overlay was used instead, and the result of
 * ramoverlay() if moving the mount or fopivot() fails.
 * NOTE(review): the final return is -1 even when the switch succeeded;
 * callers shown in this file ignore the value.
 */
int mount_overlay(struct volume *v)
{
	const char *overlay_mp = "/tmp/overlay";
	char *mp, *fs_name;
	int err;

	if (!v)
		return -1;

	mp = find_mount_point(v->blk, 0);
	if (mp) {
		ULOG_ERR("rootfs_data:%s is already mounted as %s\n", v->blk, mp);
		return -1;
	}

	err = overlay_mount_fs(v, overlay_mp);
	if (err)
		return err;

	/*
	 * Check for extroot config in overlay (rootfs_data) and if present then
	 * prefer it over rootfs_data.
	 */
	if (!mount_extroot(overlay_mp)) {
		ULOG_INFO("switched to extroot\n");
		return 0;
	}

	switch (fs_state_get(overlay_mp)) {
	case FS_STATE_UNKNOWN:
		fs_state_set(overlay_mp, FS_STATE_PENDING);
		if (fs_state_get(overlay_mp) != FS_STATE_PENDING) {
			ULOG_ERR("unable to set filesystem state\n");
			break;
		}
		/* fallthrough: the state is now PENDING, handle it as such */
	case FS_STATE_PENDING:
		ULOG_INFO("overlay filesystem has not been fully initialized yet\n");
		overlay_delete(overlay_mp, true);
		break;
	case FS_STATE_READY:
		break;
	}

	fs_name = overlay_fs_name(volume_identify(v));
	ULOG_INFO("switching to %s overlay\n", fs_name);
	if (mount_move("/tmp", "", "/overlay") || fopivot("/overlay", "/rom")) {
		ULOG_ERR("switching to %s failed - fallback to ramoverlay\n", fs_name);
		return ramoverlay();
	}

	return -1;
}
- const char *overlay_mp = “/tmp/overlay” 定义临时挂载点
- find_mount_point 查找
rootfs_data
分区的挂载点,主要目的是为了确认rootfs_data
是否已经挂载,如果未挂载,则继续执行 - overlay_mount_fs 将
rootfs_data
分区挂载到/tmp/overlay
目录 - overlay_fs_name(volume_identify(v)) 获取
rootfs_data
分区文件系统的类型 - mount_move(“/tmp”, “”, “/overlay”) 将挂载点
/tmp/overlay
迁移至/overlay
, - fopivot(“/overlay”, “/rom”) 挂载
overlayfs
,可写文件系统的挂载点为/overlay
,/rom
此时还只是一个普通文件夹
/*
 * Mount an overlay filesystem on /mnt and pivot the root onto it.
 *
 * @rw_root: directory holding the writable layer; "<rw_root>/upper" and
 *           "<rw_root>/work" are created if missing
 * @ro_root: passed to pivot() as its second argument
 *
 * The overlay is mounted with lowerdir=/, upperdir=<rw_root>/upper and
 * workdir=<rw_root>/work. A "<rw_root>/sysupgrade.tgz", if present, is
 * moved into the upper directory first.
 *
 * Returns -1 if find_filesystem("overlay") returns non-zero, if either
 * directory cannot be created, or if the mount fails; otherwise the
 * result of pivot("/mnt", ro_root).
 */
int fopivot(char *rw_root, char *ro_root)
{
	char overlay[64], mount_options[64], upperdir[64], workdir[64], upgrade[64], upgrade_dest[64];
	struct stat st;

	if (find_filesystem("overlay")) {
		ULOG_ERR("BUG: no suitable fs found\n");
		return -1;
	}

	snprintf(overlay, sizeof(overlay), "overlayfs:%s", rw_root);
	snprintf(upperdir, sizeof(upperdir), "%s/upper", rw_root);
	snprintf(workdir, sizeof(workdir), "%s/work", rw_root);
	snprintf(upgrade, sizeof(upgrade), "%s/sysupgrade.tgz", rw_root);
	snprintf(upgrade_dest, sizeof(upgrade_dest), "%s/sysupgrade.tgz", upperdir);
	snprintf(mount_options, sizeof(mount_options), "lowerdir=/,upperdir=%s,workdir=%s",
		 upperdir, workdir);

	/*
	 * Initialize SELinux security label on newly created overlay
	 * filesystem where /upper doesn't yet exist
	 */
	if (stat(upperdir, &st))
		selinux_restorecon(rw_root);

	/*
	 * Overlay FS v23 and later requires both a upper and
	 * a work directory, both on the same filesystem, but
	 * not part of the same subtree.
	 * We can't really deal with these constraints without
	 * creating two new subdirectories in /overlay.
	 */
	if (mkdir(upperdir, 0755) == -1 && errno != EEXIST)
		return -1;

	if (mkdir(workdir, 0755) == -1 && errno != EEXIST)
		return -1;

	/* the result of rename() is not checked */
	if (stat(upgrade, &st) == 0)
		rename(upgrade, upgrade_dest);

	if (mount(overlay, "/mnt", "overlay", MS_NOATIME, mount_options)) {
		ULOG_ERR("mount failed: %m, options %s\n", mount_options);
		return -1;
	}

	return pivot("/mnt", ro_root);
}
- find_filesystem(“overlay”) 判断当前内核是否支持
overlayfs
- snprintf(mount_options,xxx) 设置
overlayfs
的挂载选项 - mount(overlay, “/mnt”, “overlay”, MS_NOATIME, mount_options) 将overlayfs挂载到
/mnt
目录 - pivot(“/mnt”, ro_root) 这一步操作比较复杂,它主要做了2件事情:1.将当前进程的
/
重新挂载到ro_root
目录,也就是/rom
2.将/mnt
重新挂载为新的/
,因为上一步中overlayfs挂载到/mnt
,所以这一步结果是 overlayfs挂载到/
。
最终的效果如下:
$ mount
/dev/root on /rom type squashfs (ro,relatime) # 只读文件系统
/dev/mmcblk1p66 on /overlay type ext4 (rw,noatime) # 可写文件系统
overlayfs:/overlay on / type overlay (rw,noatime,lowerdir=/,upperdir=/overlay/upper,workdir=/overlay/work) #overlayfs
有关volume
和 driver
相关说明如下:
- volume
volume
用于描述一个分区(块设备),但与ubifs
中的volume
不是一个概念,要注意区分。
这里的volume
包含如下信息:
enum {UNKNOWN_TYPE,NANDFLASH,NORFLASH,UBIVOLUME,BLOCKDEV,
};struct volume {struct driver *drv;//分区对应driverchar *name;//分区名 char *blk;//分区对应的设备节点,dev/xxx__u64 size;//分区大小__u32 block_size;//块大小int type;//NANDFLASH/NORFLASH/UBIVOLUME
};
- driver
driver
是操作volume
的驱动,包括初始化、查找、读写、擦除等操作,这些操作与具体的文件系统有关,因此不同的文件系统会对应不同的driver。
/* Signatures of the operations a volume driver can provide. */
typedef int (*volume_probe_t)(void);
typedef int (*volume_init_t)(struct volume *v);
typedef void (*volume_stop_t)(struct volume *v);
typedef struct volume *(*volume_find_t)(char *name);
typedef int (*volume_identify_t)(struct volume *v);
typedef int (*volume_read_t)(struct volume *v, void *buf, int offset, int length);
typedef int (*volume_write_t)(struct volume *v, void *buf, int offset, int length);
typedef int (*volume_erase_t)(struct volume *v, int start, int len);
typedef int (*volume_erase_all_t)(struct volume *v);

/* A set of volume operations, linked into a list of drivers. */
struct driver {
	struct list_head	list;		/* links the different drivers together */
	char			*name;		/* driver name */
	volume_probe_t		probe;
	volume_init_t		init;
	volume_stop_t		stop;
	volume_find_t		find;
	volume_identify_t	identify;
	volume_read_t		read;
	volume_write_t		write;
	volume_erase_t		erase;
	volume_erase_all_t	erase_all;
};
mount_root->ram
/*
 * Set up a tmpfs at /tmp/root and pivot onto an overlay that uses it as
 * the writable directory.
 *
 * The results of mkdir() and mount() are not checked; the return value
 * is that of fopivot("/tmp/root", "/rom").
 */
int
ramoverlay(void)
{
	mkdir("/tmp/root", 0755);
	mount("tmpfs", "/tmp/root", "tmpfs", MS_NOATIME, "mode=0755");

	return fopivot("/tmp/root", "/rom");
}
- mount(“tmpfs”, “/tmp/root”, “tmpfs”, MS_NOATIME, “mode=0755”) 挂载
tmpfs
- fopivot(“/tmp/root”, “/rom”) 将当前进程的
/
重新挂载到/rom
目录,然后以/tmp/root
(tmpfs)作为可写层挂载overlayfs,并把这个overlayfs作为新的/
,这种overlayfs的可写部分是基于RAM的文件系统,所有的修改掉电后会丢失。
mount_root->stop
static int
stop(int argc, char *argv[1])
{if (!getenv("SHUTDOWN"))return -1;return 0;
}
- getenv(“SHUTDOWN”) 获取
SHUTDOWN
环境变量并返回结果码
mount_root->done
# openwrt/package/base-files/files/etc/init.d/done
#!/bin/sh /etc/rc.common
# Copyright (C) 2006 OpenWrt.org

START=95
# Finalize boot: run "mount_root done", drop the sysupgrade archive,
# run the user's rc.local and set the diagnostic state to "done".
boot() {
	mount_root done
	rm -f /sysupgrade.tgz && sync

	# process user commands
	[ -f /etc/rc.local ] && {
		sh /etc/rc.local
	}

	# set leds to normal state
	. /etc/diag.sh
	set_state done
}
- mount_root done 会在openwrt所有服务启动的尾声被调用
/*
 * "done" mode: finalize the "rootfs_data" volume.
 *
 * Returns -1 if no "rootfs_data" volume is found. For FS_NONE and
 * FS_DEADCODE the result of jffs2_switch() is returned. For ext4, f2fs,
 * jffs2 and ubifs the state of "/overlay" is set to FS_STATE_READY and 0
 * is returned; any other identified type also returns 0.
 * argc/argv are not used.
 */
static int
done(int argc, char *argv[1])
{
	struct volume *v = volume_find("rootfs_data");

	if (!v)
		return -1;

	switch (volume_identify(v)) {
	case FS_NONE:
	case FS_DEADCODE:
		return jffs2_switch(v);

	case FS_EXT4:
	case FS_F2FS:
	case FS_JFFS2:
	case FS_UBIFS:
		fs_state_set("/overlay", FS_STATE_READY);
		break;
	}

	return 0;
}
- volume_find(“rootfs_data”) 查询分区name为
rootfs_data
的分区,返回对应的volume
对象 - volume_identify(v) 识别
rootfs_data
分区的文件系统类型,如果是FS_EXT4
FS_F2FS
FS_JFFS2
FS_UBIFS
这4种文件系统之一,会设置文件系统状态为FS_STATE_READY
,表示overlayfs
挂载完成。
参考
核心的進入點: start_kernel()
Linux内核源码分析-安装实际根文件系统- prepare_namespace
内核启动之start_kernel()和rest_init()函数
/dev/root