前言:
写这个文章的初衷是因为今天手写了一个字典树,然后写字典树以后忽然想到了之前看的技术文章,linux kernel 之前的pid 申请方式已经从 bitmap 变成了 基数树,所以打算写文章再回顾一下这种数据结构算法
一、内核中pid的申请
旧的内核中pid的申请就不多说了,是使用的是bitmap去实现的,新版的换为了基数树。申请pid的内核代码在:
struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,size_t set_tid_size)
{
.......
}
继续看代码,申请函数为:
if (tid) {nr = idr_alloc(&tmp->idr, NULL, tid,tid + 1, GFP_ATOMIC);/** If ENOSPC is returned it means that the PID is* alreay in use. Return EEXIST in that case.*/if (nr == -ENOSPC)nr = -EEXIST;} else {int pid_min = 1;/** init really needs pid 1, but after reaching the* maximum wrap back to RESERVED_PIDS*/if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)pid_min = RESERVED_PIDS;/** Store a null pointer so find_pid_ns does not find* a partially initialized PID (see below).*/nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,pid_max, GFP_ATOMIC);}
进程最大的pid号为:
int pid_max = PID_MAX_DEFAULT;/** This controls the default maximum pid allocated to a process*/
#define PID_MAX_DEFAULT (CONFIG_BASE_SMALL ? 0x1000 : 0x8000)
这里我发现了一个问题,linux 最大的pid 是 0x8000 而不是 0xFFFF,这是为了和老的内核兼容,我们可以通过
cat /proc/sys/kernel/pid_max
查询当前内核的最大pid。
申请pid 函数我们聚焦在
idr_alloc 和 idr_alloc_cyclic
再进去看 idr_alloc_cyclic 的实现
int idr_alloc_cyclic(struct idr *idr, void *ptr, int start, int end, gfp_t gfp)
{u32 id = idr->idr_next;int err, max = end > 0 ? end - 1 : INT_MAX;if ((int)id < start)id = start;err = idr_alloc_u32(idr, ptr, &id, max, gfp);if ((err == -ENOSPC) && (id > start)) {id = start;err = idr_alloc_u32(idr, ptr, &id, max, gfp);}if (err)return err;idr->idr_next = id + 1;return id;
}
具体实现pid 申请的函数在:
int idr_alloc_u32(struct idr *idr, void *ptr, u32 *nextid,unsigned long max, gfp_t gfp)
{struct radix_tree_iter iter;void __rcu **slot;unsigned int base = idr->idr_base;unsigned int id = *nextid;if (WARN_ON_ONCE(!(idr->idr_rt.xa_flags & ROOT_IS_IDR)))idr->idr_rt.xa_flags |= IDR_RT_MARKER;id = (id < base) ? 0 : id - base;radix_tree_iter_init(&iter, id);slot = idr_get_free(&idr->idr_rt, &iter, gfp, max - base);if (IS_ERR(slot))return PTR_ERR(slot);*nextid = iter.index + base;/* there is a memory barrier inside radix_tree_iter_replace() */radix_tree_iter_replace(&idr->idr_rt, &iter, slot, ptr);radix_tree_iter_tag_clear(&idr->idr_rt, &iter, IDR_FREE);return 0;
}
我们需要重点分析这个函数,重要的数据结构radix_tree_iter,我们看一下这个数据结构:
这是一个基数树的迭代器
/*** struct radix_tree_iter - radix tree iterator state** @index: index of current slot* @next_index: one beyond the last index for this chunk* @tags: bit-mask for tag-iterating* @node: node that contains current slot** This radix tree iterator works in terms of "chunks" of slots. A chunk is a* subinterval of slots contained within one radix tree leaf node. It is* described by a pointer to its first slot and a struct radix_tree_iter* which holds the chunk's position in the tree and its size. For tagged* iteration radix_tree_iter also holds the slots' bit-mask for one chosen* radix tree tag.*/
struct radix_tree_iter {unsigned long index;unsigned long next_index;unsigned long tags;struct radix_tree_node *node;
};
迭代器中有重要的数据结构
#define radix_tree_node xa_node
也就是替换后 变为了
struct xa_node*
我们再继续看 xa_node* 这个数据结构,这是基数树的节点
/** @count is the count of every non-NULL element in the ->slots array* whether that is a value entry, a retry entry, a user pointer,* a sibling entry or a pointer to the next level of the tree.* @nr_values is the count of every element in ->slots which is* either a value entry or a sibling of a value entry.*/
struct xa_node {unsigned char shift; /* Bits remaining in each slot */unsigned char offset; /* Slot offset in parent */unsigned char count; /* Total entry count */unsigned char nr_values; /* Value entry count */struct xa_node __rcu *parent; /* NULL at top of tree */struct xarray *array; /* The array we belong to */union {struct list_head private_list; /* For tree user */struct rcu_head rcu_head; /* Used when freeing node */};void __rcu *slots[XA_CHUNK_SIZE];union {unsigned long tags[XA_MAX_MARKS][XA_MARK_LONGS];unsigned long marks[XA_MAX_MARKS][XA_MARK_LONGS];};
};
这些宏:
#ifndef XA_CHUNK_SHIFT
#define XA_CHUNK_SHIFT (CONFIG_BASE_SMALL ? 4 : 6)
#endif
#define XA_CHUNK_SIZE (1UL << XA_CHUNK_SHIFT)
XA_CHUNK_SIZE 是 64 或者16.
#define XA_MAX_MARKS 3
对应的CHUNK是6个位或者 4个位
然后我们看到pid最大是0x8000, 4 *4 为 16 个 槽,这意味着基数树高最大为3((16 / 6 ) + 1)
#if defined __x86_64__ && !defined __ILP32__
# define __WORDSIZE 64
#else
# define __WORDSIZE 32
#define __WORDSIZE32_SIZE_ULONG 0
#define __WORDSIZE32_PTRDIFF_LONG 0
#endif#ifndef BITS_PER_LONG
# define BITS_PER_LONG __WORDSIZE
#endif
BITS_PER_LONG 为64 或者32
#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))/** The IDR API does not expose the tagging functionality of the radix tree* to users. Use tag 0 to track whether a node has free space below it.*/
#define IDR_FREE 0
带入计算就是:
(64 + 64 -1) / 64 = 1
对应的宏
#define XA_MAX_MARKS 3
#define XA_MARK_LONGS DIV_ROUND_UP(XA_CHUNK_SIZE, BITS_PER_LONG)
初始化基数树的迭代器:
/*** radix_tree_iter_init - initialize radix tree iterator** @iter: pointer to iterator state* @start: iteration starting index* Returns: NULL*/
static __always_inline void __rcu **
radix_tree_iter_init(struct radix_tree_iter *iter, unsigned long start)
{/** Leave iter->tags uninitialized. radix_tree_next_chunk() will fill it* in the case of a successful tagged chunk lookup. If the lookup was* unsuccessful or non-tagged then nobody cares about ->tags.** Set index to zero to bypass next_index overflow protection.* See the comment in radix_tree_next_chunk() for details.*/iter->index = 0;iter->next_index = start;return NULL;
}
主要看idr_get_free,这个函数要看懂
shift -= RADIX_TREE_MAP_SHIFT;
shift 最大是16, 每次我们移动是6位,对应的是CHUNKSIZE
if (!tag_get(node, IDR_FREE, offset)) {offset = radix_tree_find_next_bit(node, IDR_FREE,offset + 1);start = next_index(start, node, offset);if (start > max || start == 0)return ERR_PTR(-ENOSPC);while (offset == RADIX_TREE_MAP_SIZE) {offset = node->offset + 1;node = node->parent;if (!node)goto grow;shift = node->shift;}child = rcu_dereference_raw(node->slots[offset]);}
每次循环如果循环到tag不是IDR_FREE,就会继续找下一个tag,直到找到是IDR_FREE的solt。
在这里退出循环
else if (!radix_tree_is_internal_node(child))break;
最后返回空闲槽
iter->index = start;if (node)iter->next_index = 1 + min(max, (start | node_maxindex(node)));elseiter->next_index = 1;iter->node = node;set_iter_tags(iter, node, offset, IDR_FREE);return slot;
上面的start 就应该是要申请的pid,我们最后看next_index
/** The maximum index which can be stored in a radix tree*/
static inline unsigned long shift_maxindex(unsigned int shift)
{return (RADIX_TREE_MAP_SIZE << shift) - 1;
}static inline unsigned long node_maxindex(const struct radix_tree_node *node)
{return shift_maxindex(node->shift);
}static unsigned long next_index(unsigned long index,const struct radix_tree_node *node,unsigned long offset)
{return (index & ~node_maxindex(node)) + (offset << node->shift);
}
所以start 就是要申请的pid。