1、前言
想要了解EXT文件系统的工作原理,那了解文件系统在磁盘上的分布就是必不可少的。这一节主要介绍EXT文件系统硬盘存储的物理结构。
由于当前主流的CPU架构均采用小端模式,因此下文介绍均以小端模式为准。
2、超级块
2.1 属性
下表列举出超级块中相对重要的属性。
属性名 | 含义 |
---|---|
s_log_block_size | 块大小,计算公式 = 2 ^ (10 + s_log_block_size) |
s_blocks_per_group | 每个块组中块的个数 |
s_inodes_per_group | 每个块组中索引的个数 |
s_magic | 魔数(0xEF53) |
s_inode_size | 索引大小,单位:byte |
s_feature_compat | 兼容特性 |
s_feature_incompat | 不兼容特性 |
s_feature_ro_compat | 只读兼容特性 |
s_backup_bgs | 包含超级块备份的块组号。 |
s_desc_size | 块组描述符大小 |
2.2 特性
一些默认开启或者常用的文件系统特性。
属性名 | 含义 |
---|---|
COMPAT_HAS_JOURNAL | 开启日志。 |
COMPAT_EXT_ATTR | 支持扩展属性。 |
COMPAT_RESIZE_INODE | 保留块组描述符。需要开启RO_COMPAT_SPARSE_SUPER特性。 |
COMPAT_SPARSE_SUPER2 | 稀疏超级块V2。开启本特性后,仅s_backup_bgs 属性指向的2个块组备份超级块。 |
INCOMPAT_FILETYPE | app_ext4_dir_entry结构中包含文件类型。 |
INCOMPAT_META_BG | 开启元块组属性。与COMPAT_RESIZE_INODE特性互斥。 |
INCOMPAT_64BIT | 支持超过2^32个块。 |
INCOMPAT_FLEX_BG | 开启弹性块组。 |
INCOMPAT_INLINE_DATA | 支持内联文件和目录。 |
RO_COMPAT_SPARSE_SUPER | 稀疏超级块。 |
2.3 参考代码
typedef struct {ub32 s_inodes_count; /* Inodes count */ub32 s_blocks_count; /* Blocks count */ub32 s_r_blocks_count; /* Reserved blocks count */ub32 s_free_blocks_count; /* Free blocks count */ub32 s_free_inodes_count; /* Free inodes count */ub32 s_first_data_block; /* First Data Block */ub32 s_log_block_size; /* Block size */ub32 s_log_cluster_size; /* Allocation cluster size */ub32 s_blocks_per_group; /* # Blocks per group */ub32 s_clusters_per_group; /* # Fragments per group */ub32 s_inodes_per_group; /* # Inodes per group */ub32 s_mtime; /* Mount time */ub32 s_wtime; /* Write time */ub16 s_mnt_count; /* Mount count */ub16 s_max_mnt_count; /* Maximal mount count */ub16 s_magic; /* Magic signature */ub16 s_state; /* File system state */ub16 s_errors; /* Behaviour when detecting errors */ub16 s_minor_rev_level; /* minor revision level */ub32 s_lastcheck; /* time of last check */ub32 s_checkinterval; /* max. time between checks */ub32 s_creator_os; /* OS */ub32 s_rev_level; /* Revision level */ub16 s_def_resuid; /* Default uid for reserved blocks */ub16 s_def_resgid; /* Default gid for reserved blocks *//** These fields are for EXT2_DYNAMIC_REV superblocks only.** Note: the difference between the compatible feature set and* the incompatible feature set is that if there is a bit set* in the incompatible feature set that the kernel doesn't* know about, it should refuse to mount the filesystem.** e2fsck's requirements are more strict; if it doesn't know* about a feature in either the compatible or incompatible* feature set, it must abort and not try to meddle with* things it doesn't understand...*/ub32 s_first_ino; /* First non-reserved inode */ub16 s_inode_size; /* size of inode structure */ub16 s_block_group_nr; /* block group # of this superblock */ub32 s_feature_compat; /* compatible feature set */ub32 s_feature_incompat; /* incompatible feature set */ub32 s_feature_ro_compat; /* readonly-compatible feature set */ub8 s_uuid[16]; /* 128-bit uuid for volume */b8 
s_volume_name[16]; /* volume name */b8 s_last_mounted[64]; /* directory where last mounted */ub32 s_algorithm_usage_bitmap; /* For compression *//** Performance hints. Directory preallocation should only* happen if the EXT2_FEATURE_COMPAT_DIR_PREALLOC flag is on.*/ub8 s_prealloc_blocks; /* Nr of blocks to try to preallocate*/ub8 s_prealloc_dir_blocks; /* Nr to preallocate for dirs */ub16 s_reserved_gdt_blocks; /* Per group table for online growth *//** Journaling support valid if EXT2_FEATURE_COMPAT_HAS_JOURNAL set.*/ub8 s_journal_uuid[16]; /* uuid of journal superblock */ub32 s_journal_inum; /* inode number of journal file */ub32 s_journal_dev; /* device number of journal file */ub32 s_last_orphan; /* start of list of inodes to delete */ub32 s_hash_seed[4]; /* HTREE hash seed */ub8 s_def_hash_version; /* Default hash version to use */ub8 s_jnl_backup_type; /* Default type of journal backup */ub16 s_desc_size; /* Group desc. size: INCOMPAT_64BIT */ub32 s_default_mount_opts;ub32 s_first_meta_bg; /* First metablock group */ub32 s_mkfs_time; /* When the filesystem was created */ub32 s_jnl_blocks[17]; /* Backup of the journal inode */ub32 s_blocks_count_hi; /* Blocks count high 32bits */ub32 s_r_blocks_count_hi; /* Reserved blocks count high 32 bits*/ub32 s_free_blocks_hi; /* Free blocks count */ub16 s_min_extra_isize; /* All inodes have at least # bytes */ub16 s_want_extra_isize; /* New inodes should reserve # bytes */ub32 s_flags; /* Miscellaneous flags */ub16 s_raid_stride; /* RAID stride */ub16 s_mmp_update_interval; /* # seconds to wait in MMP checking */ub64 s_mmp_block; /* Block for multi-mount protection */ub32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ub8 s_log_groups_per_flex; /* FLEX_BG group size */ub8 s_reserved_char_pad;ub16 s_reserved_pad; /* Padding to next 32bits */ub64 s_kbytes_written; /* nr of lifetime kilobytes written */ub32 s_snapshot_inum; /* Inode number of active snapshot */ub32 s_snapshot_id; /* sequential ID of active 
snapshot */ub64 s_snapshot_r_blocks_count; /* reserved blocks for activesnapshot's future use */ub32 s_snapshot_list; /* inode number of the head of the on-disk snapshotlist */ub32 s_error_count; /* number of fs errors */ub32 s_first_error_time; /* first time an error happened */ub32 s_first_error_ino; /* inode involved in first error */ub64 s_first_error_block; /* block involved of first error */ub8 s_first_error_func[32]; /* function where the error happened */ub32 s_first_error_line; /* line number where error happened */ub32 s_last_error_time; /* most recent time of an error */ub32 s_last_error_ino; /* inode involved in last error */ub32 s_last_error_line; /* line number where error happened */ub64 s_last_error_block; /* block involved of last error */ub8 s_last_error_func[32]; /* function where the error happened */ub8 s_mount_opts[64];ub32 s_usr_quota_inum; /* inode number of user quota file */ub32 s_grp_quota_inum; /* inode number of group quota file */ub32 s_overhead_blocks; /* overhead blocks/clusters in fs */ub32 s_backup_bgs[2]; /* If sparse_super2 enabled */ub32 s_reserved[106]; /* Padding to the end of the block */ub32 s_checksum; /* crc32c(superblock) */
} app_ext4_super_block;
3、组描述符
3.1 属性
下表列举出组描述符的关键属性。
属性名 | 含义 |
---|---|
bg_inode_table | 索引表的物理偏移。 |
bg_inode_table_hi | 索引表的物理偏移的高32位。 |
3.2 索引表计算
已知目标文件的Inode = 357,块大小 block_size_ = 4096,每个块组的Inode数 inode_count_ = 8192, 组描述大小 gdt_size_ = 32,索引Inode大小 inode_size_ = 256,该如何找到文件对应的组描述符以及索引节点所在的位置呢?
首先,计算出文件所在的块组,bg_no = (inode_no - 1) / inode_count_ = 356 / 8192 = 0, 即文件属于第一个块组。
接着,计算文件所在的组描述符的位置,其中 gdt_count_ = block_size_ / gdt_size_ 表示一个块中能存放的组描述符个数,gdt_block_no = bg_no / gdt_count_ = 0 / (4096 / 32) = 0,即文件所在的组描述符在组描述符表的第一个块中。
然后,计算文件所在的组描述符在块中的位置,gdt_index = bg_no % gdt_count_ = 0,块中的第一个组描述符即文件所在的组描述符。
其次,计算文件在所在块组中的索引, inode_partition = (inode_no - 1) % inode_count_ = 356 % 8192 = 356, 即文件在块组内的索引下标为356(从0开始计数),是块组的第357个inode节点。
再次,计算文件在索引表中的位置,inode_block_no = inode_partition / it_inode_count = 356 / (4096 / 256) = 22, 即文件所在的索引在索引表中下标为22的块(从0开始计数,即第23个块)中。
最后,从组描述符的bg_inode_table和bg_inode_table_hi获取inode_table_no,计算出索引节点所在块的偏移位置file_offset = (inode_table_no + inode_block_no) * 4096。索引节点在该块内的偏移为 (inode_partition % it_inode_count) * inode_size_ = (356 % 16) * 256 = 1024。
默认情况下,所有的组描述符在第一个块组中都存在备份,因此从第一个块组中读取对应的组描述符即可。
// inode 0 is defined but not exist, so actual inode no begin with 1.
// the bg number of the inode_no
b32 bg_no = (inode_no - 1) / volume_->inode_count_;
// the gdt number in bg
b32 gdt_block_no = bg_no / volume_->gdt_count_;
// the index of gdt in the bg which this inode in
b32 gdt_index = bg_no % volume_->gdt_count_;
// the index of inode in the bg which this inode in
b32 inode_partition = (inode_no - 1) % volume_->inode_count_;
// the inode count in one IT block
b32 it_inode_count = volume_->block_size_ / volume_->inode_size_;
// the index of IT block in the bg which this inode in
b32 inode_block_no = inode_partition / it_inode_count;
// move file pointer to gdt blockb64 file_offset = 0;
if (volume_->meta_group_)file_offset = GetGDTOffset(gdt_block_no * (b64)volume_->gdt_count_);
else// use gdt in first bgfile_offset = GetGDTOffset(0) + gdt_block_no * (b64)volume_->block_size_;
if (lseek64(volume_->fd_, file_offset, SEEK_SET) != file_offset) goto IOErr;gdt_record_ = (app_ext4_group_desc *)new char[volume_->block_size_];
if (volume_->block_size_ !=read(volume_->fd_, gdt_record_, volume_->block_size_))goto IOErr;// get offset of block which inode in
if (!volume_->extend64_) {file_offset = (gdt_record_[gdt_index].bg_inode_table + inode_block_no) *(b64)volume_->block_size_;
} else {app_ext4_group_desc64 *gdt_record =(app_ext4_group_desc64 *)((char *)gdt_record_.get() +volume_->gdt_size_ * gdt_index);b64 inode_table_no =gdt_record->bg_inode_table | ((b64)gdt_record->bg_inode_table_hi << 32);file_offset = (inode_table_no + inode_block_no) * volume_->block_size_;
}
if (lseek64(volume_->fd_, file_offset, SEEK_SET) != file_offset) goto IOErr;inode_record_ = (app_ext4_inode *)new char[volume_->block_size_];
if (volume_->block_size_ !=read(volume_->fd_, inode_record_, volume_->block_size_))goto IOErr;
3.3 参考代码
/*
 * On-disk block group descriptor, 32-bit form.
 * Field order and widths define the on-disk layout.
 */
typedef struct {
  ub32 bg_block_bitmap;          /* Blocks bitmap block */
  ub32 bg_inode_bitmap;          /* Inodes bitmap block */
  ub32 bg_inode_table;           /* Inodes table block */
  ub16 bg_free_blocks_count;     /* Free blocks count */
  ub16 bg_free_inodes_count;     /* Free inodes count */
  ub16 bg_used_dirs_count;       /* Directories count */
  ub16 bg_flags;                 /* EXT4_BG_flags (INODE_UNINIT, etc) */
  ub32 bg_exclude_bitmap_lo;     /* Exclude bitmap for snapshots */
  ub16 bg_block_bitmap_csum_lo;  /* crc32c(s_uuid+grp_num+bitmap) LSB */
  ub16 bg_inode_bitmap_csum_lo;  /* crc32c(s_uuid+grp_num+bitmap) LSB */
  ub16 bg_itable_unused;         /* Unused inodes count */
  ub16 bg_checksum;              /* crc16(sb_uuid+group+desc) */
} app_ext4_group_desc;

/*
 * On-disk block group descriptor, 64-bit form: the same leading fields as
 * app_ext4_group_desc, followed by the high-order (MSB) parts.
 */
typedef struct {
  ub32 bg_block_bitmap;          /* Blocks bitmap block */
  ub32 bg_inode_bitmap;          /* Inodes bitmap block */
  ub32 bg_inode_table;           /* Inodes table block */
  ub16 bg_free_blocks_count;     /* Free blocks count */
  ub16 bg_free_inodes_count;     /* Free inodes count */
  ub16 bg_used_dirs_count;       /* Directories count */
  ub16 bg_flags;                 /* EXT4_BG_flags (INODE_UNINIT, etc) */
  ub32 bg_exclude_bitmap_lo;     /* Exclude bitmap for snapshots */
  ub16 bg_block_bitmap_csum_lo;  /* crc32c(s_uuid+grp_num+bitmap) LSB */
  ub16 bg_inode_bitmap_csum_lo;  /* crc32c(s_uuid+grp_num+bitmap) LSB */
  ub16 bg_itable_unused;         /* Unused inodes count */
  ub16 bg_checksum;              /* crc16(sb_uuid+group+desc) */
  ub32 bg_block_bitmap_hi;       /* Blocks bitmap block MSB */
  ub32 bg_inode_bitmap_hi;       /* Inodes bitmap block MSB */
  ub32 bg_inode_table_hi;        /* Inodes table block MSB */
  ub16 bg_free_blocks_count_hi;  /* Free blocks count MSB */
  ub16 bg_free_inodes_count_hi;  /* Free inodes count MSB */
  ub16 bg_used_dirs_count_hi;    /* Directories count MSB */
  ub16 bg_itable_unused_hi;      /* Unused inodes count MSB */
  ub32 bg_exclude_bitmap_hi;     /* Exclude bitmap block MSB */
  ub16 bg_block_bitmap_csum_hi;  /* crc32c(s_uuid+grp_num+bitmap) MSB */
  ub16 bg_inode_bitmap_csum_hi;  /* crc32c(s_uuid+grp_num+bitmap) MSB */
  ub32 bg_reserved;
} app_ext4_group_desc64;
4、索引节点
4.1 属性
下表列举出Inode中相对重要的属性。
属性名 | 含义 |
---|---|
i_mode | 文件属性和文件类型。 |
i_size_lo | 文件大小低32位(对应下文参考代码中的 i_size 字段)。 |
i_links_count | 硬链接数量。 |
i_flags | 标志位。 |
i_block | 块图或者扩展树,存储文件内容或者目录索引。 |
i_size_high | 文件大小高32位。 |
i_extra_isize | 索引节点中超出基础结构部分的额外字段大小(扩展属性紧跟在这些额外字段之后)。 |
4.2 文件标识
值 | 含义 |
---|---|
0x1000 | S_IFIFO (FIFO) |
0x2000 | S_IFCHR (Character device) |
0x4000 | S_IFDIR (Directory) |
0x6000 | S_IFBLK (Block device) |
0x8000 | S_IFREG (Regular file) |
0xA000 | S_IFLNK (Symbolic link) |
0xC000 | S_IFSOCK (Socket) |
4.3 文件内容
通常情况下,i_block中用于存储文件所有块的索引信息。某些特殊场景下,会用于其它情况。
- 软链接(Symbolic Links)
  当链接的目标路径长度小于60时, 会将目标路径存储在i_block中。
- 内联数据(Inline Data)
  当文件系统开启Inline Data特性,且数据长度小于156(目前)时,用于存储内容的前60个字节。
- 直接/间接块索引(Direct/Indirect Block Addressing)
  i_block[0:11]:存储数据内容的块号。
  i_block[12] :指向间接数据块(存储数据块号的数据块)。
  i_block[13]:指向双重间接数据块(存储间接数据块的数据块)。
  i_block[14]:指向三重间接数据块(存储双重间接数据块的数据块)。
- 扩展树索引(Extent Tree)
  通过树的形式管理文件或者文件夹的数据块。扩展树的详细介绍请参考最后一节。
4.4 参考代码
#define EXT4_N_BLOCKS 15
typedef struct {ub16 i_mode; /* File mode */ub16 i_uid; /* Low 16 bits of Owner Uid */ub32 i_size; /* Size in bytes */ub32 i_atime; /* Access time */ub32 i_ctime; /* Inode Change time */ub32 i_mtime; /* Modification time */ub32 i_dtime; /* Deletion Time */ub16 i_gid; /* Low 16 bits of Group Id */ub16 i_links_count; /* Links count */ub32 i_blocks; /* Blocks count */ub32 i_flags; /* File flags */union {struct {ub32 l_i_version; /* was l_i_reserved1 */} linux1;struct {ub32 h_i_translator;} hurd1;} osd1; /* OS dependent 1 */ub32 i_block[EXT4_N_BLOCKS]; /* Pointers to blocks */ub32 i_generation; /* File version (for NFS) */ub32 i_file_acl; /* File ACL */ub32 i_size_high; /* Formerly i_dir_acl, directory ACL */ub32 i_faddr; /* Fragment address */union {struct {ub16 l_i_blocks_hi;ub16 l_i_file_acl_high;ub16 l_i_uid_high; /* these 2 fields */ub16 l_i_gid_high; /* were reserved2[0] */ub16 l_i_checksum_lo; /* crc32c(uuid+inum+inode) */ub16 l_i_reserved;} linux2;struct {ub8 h_i_frag; /* Fragment number */ub8 h_i_fsize; /* Fragment size */ub16 h_i_mode_high;ub16 h_i_uid_high;ub16 h_i_gid_high;ub32 h_i_author;} hurd2;} osd2; /* OS dependent 2 */ub16 i_extra_isize;ub16 i_checksum_hi; /* crc32c(uuid+inum+inode) */ub32 i_ctime_extra; /* extra Change time (nsec << 2 | epoch) */ub32 i_mtime_extra; /* extra Modification time (nsec << 2 | epoch) */ub32 i_atime_extra; /* extra Access time (nsec << 2 | epoch) */ub32 i_crtime; /* File creation time */ub32 i_crtime_extra; /* extra File creation time (nsec << 2 | epoch)*/ub32 i_version_hi; /* high 32 bits for 64-bit version */
} app_ext4_inode;
5、扩展属性
扩展属性通常用于存储文件的ACLs访问权限和一些其他的安全属性,例如selinux等。因此通常情况下,使用文件系统时并不需要关注文件的扩展属性。
但有一种例外情况,那就是开启了内联数据特性后,文件的一部分数据内容会存储到扩展属性中。
我们可以在2个地方找到文件的扩展属性,其一,2个索引信息的中间;其二,i_file_acl指向的块。而内联数据则存在于第一个地方。
扩展属性块以 app_ext4_attr_header 结构开始,但位于索引信息之后时只存在第一个字段 h_magic = 0xEA020000。
实际的扩展属性用 app_ext4_attr_entry 管理,当 e_name_index = 7 且 e_name = data 时,则代表内联数据。
typedef struct {ub32 h_magic; /* magic number for identification */ub32 h_refcount; /* reference count */ub32 h_blocks; /* number of disk blocks used */ub32 h_hash; /* hash value of all attributes */ub32 h_reserved[4]; /* zero right now */
} app_ext4_attr_header;typedef struct {ub8 e_name_len; /* length of name */ub8 e_name_index; /* attribute name index */ub16 e_value_offs; /* offset in disk block of value */ub32 e_value_block; /* disk block attribute is stored on (n/i) */ub32 e_value_size; /* size of attribute value */ub32 e_hash; /* hash value of name and value */
} app_ext4_attr_entry;// 获取扩展内联数据
app_ext4_attr_header *attr_header =(app_ext4_attr_header *)((b8 *)&inode_info_->i_extra_isize +inode_info_->i_extra_isize);if (attr_header->h_magic != kExtAttrMagic) return false;// Extended attributes, when stored after the inode,// have a header ext4_xattr_ibody_header that is 4 bytes longapp_ext4_attr_entry *attr_data =(app_ext4_attr_entry *)((b8 *)attr_header + sizeof(attr_header->h_magic));while (attr_data->e_name_index != kExtAttrDataIdx ||attr_data->e_name_len != sizeof(kExtAttrDataName)) {attr_data =(app_ext4_attr_entry *)((b8 *)attr_data + sizeof(app_ext4_attr_entry) +(attr_data->e_name_len + 3) / 4 * 4);}// For an inode attribute e_value_offs is relative to the first entryif (*(b32 *)((b8 *)attr_data + sizeof(app_ext4_attr_entry)) ==kExtAttrDataName) {memcpy(inline_data_,(b8 *)attr_header + sizeof(attr_header->h_magic) +attr_data->e_value_offs,attr_data->e_value_size);}
6、扩展树
由于直接/间接块索引的种种缺陷,在EXT4中推出了扩展树取而代之。扩展树,顾名思义,通过树的形式管理数据块。
其中每个节点以 app_ext4_extent_header 开始,非叶子节点时,后接 app_ext4_extent_idx 结构;叶子节点则紧跟 app_ext4_extent 结构。
app_ext4_extent_header
用于存储当前节点的信息。
变量 | 含义 |
---|---|
eh_magic | 魔数,0xF30A。 |
eh_entries | 当前节点存储的数据个数。 |
eh_depth | 当前节点的深度,0则代表当前是叶子节点。 |
app_ext4_extent
存储实际的数据块信息。
变量 | 含义 |
---|---|
ee_block | 起始的逻辑块地址。 |
ee_len | 当前extent管理的实际物理块个数。ee_len = ee_len > 32768 ? ee_len - 32768 : ee_len |
ee_start_hi / ee_start | 将高位左移32位后与低位按位或,即 (ee_start_hi << 32) | ee_start,即可得出起始的物理块地址。 |
app_ext4_extent_idx
存储下一层节点的信息。
变量 | 含义 |
---|---|
ei_block | 起始的逻辑块地址。 |
ei_leaf/ ei_leaf_hi | 将高位左移32位后与低位按位或,即 (ei_leaf_hi << 32) | ei_leaf,即可得出下一层节点的物理块地址。 |
/*
 * Header at the start of every extent tree node.
 */
typedef struct {
  ub16 eh_magic;       /* Probably will support different formats */
  ub16 eh_entries;     /* Number of valid entries */
  ub16 eh_max;         /* Capacity of store in entries */
  ub16 eh_depth;       /* Has tree real underlying blocks? */
  ub32 eh_generation;  /* Generation of the tree */
} app_ext4_extent_header;

/*
 * Leaf entry of the extent tree: describes a run of data blocks.
 */
typedef struct {
  ub32 ee_block;     /* First logical block the extent covers */
  ub16 ee_len;       /* Number of blocks covered by the extent */
  ub16 ee_start_hi;  /* High 16 bits of the physical block */
  ub32 ee_start;     /* Low 32 bits of the physical block */
} app_ext4_extent;

/*
 * Index entry of the extent tree: points to a node one level down.
 */
typedef struct {
  ub32 ei_block;    /* Index covers logical blocks from 'block' */
  ub32 ei_leaf;     /* Pointer to the physical block of the next level;
                       a leaf or another index node could be here */
  ub16 ei_leaf_hi;  /* High 16 bits of the physical block */
  ub16 ei_unused;
} app_ext4_extent_idx;