内核链表

Linux内核有一些基本的数据结构，这些数据结构是Linux实现的基础，对于链表相信大家都不陌生，但是Linux内核中的链表与平常平常我们所使用的链表略有不同，第一次遇到或许会感到困惑。

概览

先来看一个链表的节点，对于一个节点，分为两部分，一部分是数据，另一部分是串联数据的指针。Linux链表节点的定义如下(以下代码皆为3.5版本)：

// include/linux/types.h
struct list_head {
        struct list_head *next, *prev;
};

这里的定义有些奇怪，因为仅有前后节点的指针，并没有数据，就像一串链子，只有线没有线上的珠子，肯定是无法使用，那Linux内核如何把这些“珠子”附着到线上的呢？

来看一个简单的例子：

struce simple {
	int data;
	struct list_head list;
};

simple结构体的list成员指向下一个或者上一个simple的list，这样便把节点串联起来了，data作为“珠子”附着在list线上，但这样仍然有一个问题，list成员仅仅指向下一个simple的list成员，那从list成员如何得到simple节点的地址呢？

答案是根据list成员的地址以及list成员在simple的位置便可以计算出simple对象的地址，这样有些繁琐，Linux提供了一个宏，可以简化这个过程：

// include/linux/list.h
/**
 * list_entry - get the struct for this entry
 * @ptr:        the &struct list_head pointer.
 * @type:       the type of the struct this is embedded in.
 * @member:     the name of the list_struct within the struct.
 */
#define list_entry(ptr, type, member) \
        container_of(ptr, type, member)

// include/linux/kernel.h
#define container_of(ptr, type, member) ({                      \
        const typeof( ((type *)0)->member ) *__mptr = (ptr);    \
        (type *)( (char *)__mptr - offsetof(type,member) );})

#undef offsetof
#ifdef __compiler_offsetof
#define offsetof(TYPE,MEMBER) __compiler_offsetof(TYPE,MEMBER)
#else
#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
#endif
#endif /* __KERNEL__ */

可以看到，list_entry直接调用了container_of，container_of分为两句，((type *)0)->member可以获得member在结构体type中的偏移，假设有一个结构体在地址0的位置，那么成员的地址便是成员对结构体的偏移，typeof是gcc的扩展，用于获取变量的类型，offsetof(type,member) 获取member成员在type中的偏移，然后使用member成员的指针ptr（复制成__mptr）减去偏移，即是结构体的地址。在我们的例子中，从list成员的地址获取simple结构的地址如下：

simple * p = list_entry(ptr, struct simple, list);

这样便解决了从list_head上获取附着的数据的问题。接下来需要解决对链表的增删改查的问题：

初始化链表

初始化链表有两种方法，LIST_HEAD_INIT和LIST_HEAD

#define LIST_HEAD_INIT(name) { &(name), &(name) }

#define LIST_HEAD(name) \
        struct list_head name = LIST_HEAD_INIT(name)

static inline void INIT_LIST_HEAD(struct list_head *list)
{
        list->next = list;
        list->prev = list;
}

创建一个指向自身的节点。

插入节点

在节点后插入新节点list_add_tail，和在节点前插入新节点list_add：

/**
 * list_add - add a new entry
 * @new: new entry to be added
 * @head: list head to add it after
 *
 * Insert a new entry after the specified head.
 * This is good for implementing stacks.
 */
static inline void list_add(struct list_head *new, struct list_head *head)
{
        __list_add(new, head, head->next);
}

/**     
 * list_add_tail - add a new entry
 * @new: new entry to be added
 * @head: list head to add it before
 *
 * Insert a new entry before the specified head.
 * This is useful for implementing queues.
 */
static inline void list_add_tail(struct list_head *new, struct list_head *head)
{
        __list_add(new, head->prev, head);
}

其中 __list_add 只是普通的链表操作，并无特别之处，可参见Linux源码查看实现。

删除节点

static inline void list_del(struct list_head *entry)
{
        __list_del(entry->prev, entry->next);
        entry->next = LIST_POISON1;
        entry->prev = LIST_POISON2;
}

__list_del 把entry从链表中删除，之后把entry链表指针复制成非空指针（如果使用会出现段错误）

检查是否空链表

判断一个链表是否为空，只需要看头节点是否指向自己便可：

static inline int list_empty(const struct list_head *head)
{
        return head->next == head;
}

遍历链表

遍历是这几种操作中最为复杂的，有四个函数：

#define list_for_each(pos, head) \
        for (pos = (head)->next; pos != (head); pos = pos->next)

#define list_for_each_prev(pos, head) \
        for (pos = (head)->prev; pos != (head); pos = pos->prev)

#define list_for_each_entry(pos, head, member)                          \
        for (pos = list_entry((head)->next, typeof(*pos), member);      \
             &pos->member != (head);    \
             pos = list_entry(pos->member.next, typeof(*pos), member))

#define list_for_each_entry_reverse(pos, head, member)                  \
        for (pos = list_entry((head)->prev, typeof(*pos), member);      \
             &pos->member != (head);    \
             pos = list_entry(pos->member.prev, typeof(*pos), member))

list_for_each 和 list_for_each_prev 较为简单，一个向后遍历，另一个向前遍历，list_for_each_entry和list_for_each_entry_reverse功能相似，不过不是对list_head操作，而是直接对结构体操作，如我们这里的simple结构。根据之前的叙述也不难理解函数实现，只是在list_head上调用了list_entry获取了完整结构。

实例

千言万语不如一个例子来的直观，我们通过一个简单的例子说明一下如何使用内核链表：

#include <linux/list.h>
#include <linux/kernel.h>
#include <stdio.h>

struct simple {
    int data;
    struct list_head list;
};

int main()
{
    int i = 0;
    struct simple * p;
    struct list_head * pos;
    LIST_HEAD(head);
    for (i = 0; i < 10; i++) {
        p = (struct simple*)malloc(sizeof(struct simple));
        p->data = i * 10;
        list_add_tail(&p->list, &head);
    }

    list_for_each_entry(p, &head, list) {
        printf("for %d\n", p->data);
    }

    while (!list_empty(&head)) {
        pos = head.next;
        p = (struct simple*)list_entry(pos,
                struct simple, list);
        list_del(pos);
        printf("del %d\n", p->data);
        free(p);
    }
    return 0;
}

编译参数为

gcc -D__KERNEL__ -I/usr/src/linux-headers-3.2.0-27-generic/include/ -I/usr/src/linux-headers-3.2.0-27-generic/arch/ia64/include/ simple.c

其中头文件中都是内核函数，需要宏__KERNEL__，否则大部分定义会被忽略。

Hash表

定义

链表虽然是最常见的数据结构，但实际使用中，由于链表的检索能力较差，更多的是作为队列和栈结构使用，如果需要查询，比如通过pid查找进程，通过描述符查找inode，就需要用到检索更快的数据结构——Hash表。先来看Hash节点的定义：

struct hlist_head {
	struct hlist_node *first;
};
struct hlist_node {
	struct hlist_node *next, **pprev;
};

其中 hlist_head 为头结点，与链表不同的是还需要一个节点的概念 hlist_node，他们之间的关系如图：

左侧是定长数组，每个节点是 hlist_head ，其中first指向 hlist_node 节点，hlist_node又组成一列链表，hlist_node的链表结构跟 list_head不同之处在于 hlist_node 的 pprev 指向了前一个节点的指针地址。

基本操作

Hash表使用起来也非常简单，首先通过hash函数计算目标值，得到一个索引，在hlist_head数组中找到相应位置，再插入链表。Hash的链表Linux也提供了多种操作方式：

#define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL)
static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h);
static inline void hlist_add_before(struct hlist_node *n, 
		struct hlist_node *next)
static inline void hlist_add_after(struct hlist_node *n, 
		struct hlist_node *next)
static inline void hlist_move_list(struct hlist_head *old,
		struct hlist_head *new);

#define hlist_entry(ptr, type, member) container_of(ptr,type,member)
#define hlist_for_each(pos, head) \     
        for (pos = (head)->first; pos ; pos = pos->next)
#define hlist_for_each_entry(tpos, pos, head, member)                    \
        for (pos = (head)->first;                                        \
             pos &&                                                      \
                ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
             pos = pos->next)

INIT_HLIST_HEAD —— 初始化链表；
hlist_add_head —— 在链表头插入节点；
hlist_add_before —— 在一个节点之前插入；
hlist_add_after —— 在一个节点之后插入；
hlist_entry —— 同list_entry；
hlist_for_each —— 相关的一系列函数进行链表的遍历，与普通双向链表操作相同。

除了Hash表中链表的操作，还有一个重要的元素是Hash函数，Hash函数的好坏直接影响Hash表的性能，Hash函数一般来讲跟具体要实现的业务相关，include/linux/jhash.h下实现了几个不同用途的Hash函数，另外，Linux还给出一个简单的Hash函数：

static inline u32 hash_32(u32 val, unsigned int bits)
{
        /* On some cpus multiply is faster, on others gcc will do shifts */
        u32 hash = val * GOLDEN_RATIO_PRIME_32;

        /* High bits are more random, so use them. */
        return hash >> (32 - bits);
}

这个函数把32位整数Hash成bits位的整数，另外还有 hash_64 等等Hash函数。

实例

最后再来通过一个简单的例子看一下Hash表的使用：

#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/list.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/hash.h>

struct simple_hash
{
    int data;
    struct hlist_node node;
};

struct hlist_head * phash = NULL;

static int __init initialization(void)
{
    int i,k;
    struct hlist_head * phead;
    struct hlist_node * pnode;
    struct simple_hash * p;

    printk(KERN_INFO " init simple start\n");

    phash = (struct hlist_head*)kmalloc(sizeof(struct hlist_head) * 0xFF, GFP_KERNEL);
    for (i = 0; i < 0xFF; i++) {
        INIT_HLIST_HEAD(&phash[i]);
    }
    for (i = 0; i < 10; i++) {
        p = (struct simple_hash*)kmalloc(sizeof(struct simple_hash), GFP_KERNEL);
        k = i * 13;

        p->data = k;
        INIT_HLIST_NODE(&p->node);

        printk(KERN_INFO "insert %d\n", k);
        phead = &phash[hash_32(k, 8)];
        hlist_add_head(&p->node, phead);
    }
    k = 3 * 13;
    phead = &phash[hash_32(k, 8)];
    printk(KERN_INFO "search %d\n", k);
    hlist_for_each_entry(p, pnode, phead, node) {
        if (p->data == k) {
            printk(KERN_INFO " find it\n");
        }
    }

    printk(KERN_INFO "init simple end\n");
    return 0;
}

static void __exit cleanup(void)
{
    int i;
    struct hlist_head * phead = NULL;
    struct simple_hash * p = NULL;
    printk(KERN_INFO "cleanup simple\n");

    if (phash == NULL) {
        return;
    }

    for (i = 0; i < 0xFF; i++) {
        phead = &phash[i];
        while (!hlist_empty(phead)) {
            p = hlist_entry(phead->first, struct simple_hash, node);
            printk(KERN_INFO "delete %d", p->data);
            hlist_del(&p->node);
            kfree(p);
        }
    }
    kfree(phead);
}

module_init(initialization);
module_exit(cleanup);

MODULE_AUTHOR("cppbreak cppbreak@gmail.com");
MODULE_DESCRIPTION("A simple linux kernel module");
MODULE_VERSION("V0.1");
MODULE_LICENSE("Dual BSD/GPL");

这里的代码是一个简单的Linux内核，关于如何写一个简单的Linux内核，可以参阅 Linux字符设备驱动框架

红黑树

定义

Hash表虽然检索复杂度只需要O(1)，但是性能依赖于Hash函数，不合适的Hash函数会导致性能极其低下，而且需要预分配Hash头结点的空间，扩展性和性能稳定性都是问题，所以Linux内核另外还有一套快速插入查询的数据结构——红黑树。

红黑树是一种平衡二叉树，二叉树查找效率是O(logn)，但对于某些极端情况下，一个分支的节点远高于另一个分支，处于不平衡状态下的二叉树性能很差，极端情况跟链表相同。所以出现很多保持二叉树平衡的方法，其中一种便是红黑树，红黑树在二叉树的基础上又规定了一些规则：

性质1：节点是红色或黑色。
性质2：根节点是黑色。
性质3：每个叶节点是黑色的。
性质4：每个红色节点的两个子节点都是黑色。(从每个叶子到根的所有路径上不能有两个连续的红色节点)
性质5：从任一节点到其每个叶子的所有路径都包含相同数目的黑色节点。

红黑树的性质保证了从根节点到任何一个叶子节点的长度都不会相差两倍以上，只要在增删的过程中保持红黑树的性质，也便保持了树的平衡性。

实现

红黑树在内核中的实现位于：

linux/lib/rbtree.c
linux/include/linux/rbtree.h

头文件 rbtree.h 中有红黑树节点的定义：

struct rb_node
{
	unsigned long  rb_parent_color;
#define RB_RED          0
#define RB_BLACK        1
	struct rb_node *rb_right;
	struct rb_node *rb_left;
} __attribute__((aligned(sizeof(long))));
/* The alignment might seem pointless, but allegedly CRIS needs it */

struct rb_root
{
	struct rb_node *rb_node;
};

可以看到红黑树同链表一样，都没有数据块的定义，因为C语言中没有模版，这样实现能使数据结构跟具体的业务分离，rb_root 中仅有一个 rb_node 域，rb_node中有颜色和左右节点的指针，都很容易理解。

初始化

节点的初始化操作，节点默认被置为红色：

static inline void rb_init_node(struct rb_node *rb)
{
	rb->rb_parent_color = 0;
	rb->rb_right = NULL;
	rb->rb_left = NULL;
	RB_CLEAR_NODE(rb);
}

增删操作

红黑树的增加删除操作如下：

// node为已插入的节点，函数检查树的平衡性并修正
void rb_insert_color(struct rb_node *node, struct rb_root *root);
// 删除节点
void rb_erase(struct rb_node *node, struct rb_root *root);

由于二叉树需要比较数据区域的大小，这里仅仅有节点是不够的，所以插入操作需要调用者比较插入，然后调用rb_insert_color调整树的平衡性，rb_erase可以直接删除，另外搜索操作也需要用户根据二叉树的性质自己处理，在头文件rbtree.h开头的注释给了一个使用的例子，这里再试着写一个简单的例子：

例子

#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/rbtree.h>

struct simple
{
    int data;
    struct rb_node node;
};

struct simple * search_simple(struct rb_root * root, int data)
{
    struct simple * p;
    struct rb_node * pnode = root->rb_node;
    while (pnode)
    {
        p = rb_entry(pnode, struct simple, node);
        if (data < p->data) {
            pnode = pnode->rb_left;
        } else if (data > p->data){
            pnode = pnode->rb_right;
        } else {
            return p;
        }
    }
    return NULL;
}

struct simple * insert_simple(struct rb_root * root,
        int data, struct simple * new)
{
    struct simple * p = NULL;
    struct rb_node ** pnode = &root->rb_node;
    struct rb_node * parent = NULL;
    while (*pnode)
    {
        parent = *pnode;
        p = rb_entry(parent, struct simple, node);
        if (data < p->data) {
            pnode = &(*pnode)->rb_left;
        } else if (data > p->data){
            pnode = &(*pnode)->rb_right;
        } else {
            return p;
        }
    }
    rb_link_node(&new->node, parent, pnode);
    rb_insert_color(&new->node, root);
    return p;
}

struct rb_root g_root = RB_ROOT;

static int __init initialization(void)
{
    int i;
    struct simple * p;
    for (i = 0; i < 10; i++) {
        p = (struct simple *)kmalloc(sizeof (struct simple),
                GFP_KERNEL);
        p->data = i * 10;
        rb_init_node(&p->node);
        insert_simple(&g_root, i * 10, p);
        printk(KERN_INFO "insert %d\n", i * 10);
    }
    p = search_simple(&g_root, 30);
    if (p != NULL) {
        printk(KERN_INFO "find it\n");
    }

    return 0;
}

static void __exit cleanup(void)
{
    struct simple * p;
    struct rb_node * pnode = rb_first(&g_root);
    while (pnode != NULL) {
        p = rb_entry(pnode, struct simple, node);        
        printk(KERN_INFO "delete %d\n", p->data);
        rb_erase(pnode, &g_root);
        kfree(p);
        pnode = rb_first(&g_root);
    }
}

module_init(initialization);
module_exit(cleanup);

MODULE_AUTHOR("cppbreak cppbreak@gmail.com");
MODULE_DESCRIPTION("A simple linux kernel module");
MODULE_VERSION("V0.1");
MODULE_LICENSE("Dual BSD/GPL");

程序编译运行请参考 Linux字符设备驱动框架。

Radix Tree

定义

Linux内核实现了大量的数据结构，其中树结构除了前面提到的红黑树以外，还有一种很有意思的树-Radix Tree，翻译为基数树。Radix Tree也是一种key-value结构，要说最快速的key-value查找结构，非数组莫属，只需一次定位便可找到目标值，其缺点是需要占用过多的内存空间，如果连续的key中大多数都是空值，空间的利用率会非常低，如一个整数范围的key，value占一个字节也需要4G的存储，当key非常稀疏的情况下，无疑是一种浪费，Radix Tree的解决方式是分级，简单来说，如果按照整数的尾数分为2级，第一级需要10个指针即可，尾数为0的key映射到第一个指针上，第一个指针再指向存储数组，这样如果只有0-4的key，便可不需要给5-9的指针分配空间，可节省一半的空间。当然这只是简单的例子，内核的Radix Tree更为精炼。

实现

Linux内核的根据Radix Tree的高度把key分为若干段，如下图所示，每段长度固定为RADIX_TREE_MAP_SHIFT，树高度可通过radix_tree_extend扩展，key的每一段分别指定了每一层的位置。

Radix Tree

接下来可以看一些处理细节：

struct radix_tree_node {
	unsigned int    height;         /* Height from the bottom */
	unsigned int    count;
	union {
		struct radix_tree_node *parent; /* Used when ascending tree */
		struct rcu_head rcu_head;       /* Used when freeing node */
	};
	void __rcu      *slots[RADIX_TREE_MAP_SIZE];
	unsigned long   tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
};

radix_tree_node 是基数树的节点定义，其中最主要的字段是 RADIX_TREE_MAP_SIZE 大小的数组 slots，其中可以指向下一层节点或者叶子节点 item。

struct radix_tree_root {
	unsigned int            height;
	gfp_t                   gfp_mask;
	struct radix_tree_node  __rcu *rnode;
};

#define RADIX_TREE_INIT(mask)   {                                       \
	.height = 0,                                                    \
	.gfp_mask = (mask),                                             \
	.rnode = NULL,                                                  \
}

#define RADIX_TREE(name, mask) \
struct radix_tree_root name = RADIX_TREE_INIT(mask)

radix_tree_root 是根节点类型，记录了数高度 height 和第一层节点rnode，RADIX_TREE_INIT宏对根节点初始化。

接下来看基数树的核心，如何插入节点和如何查找节点：

int radix_tree_insert(struct radix_tree_root *root,
				unsigned long index, void *item)
{
	/** 省略 */
	/* Make sure the tree is high enough.  */
	if (index > radix_tree_maxindex(root->height)) {
		error = radix_tree_extend(root, index);
		if (error)
			return error;
	}

	slot = indirect_to_ptr(root->rnode);

	height = root->height;
	shift = (height-1) * RADIX_TREE_MAP_SHIFT;

	offset = 0;                     /* uninitialised var warning */
	while (height > 0) {
		if (slot == NULL) {
			/* Have to add a child node.  */
			if (!(slot = radix_tree_node_alloc(root)))
				return -ENOMEM;
			slot->height = height;
			slot->parent = node;
			if (node) {
				rcu_assign_pointer(node->slots[offset], slot);
				node->count++;
			} else
				rcu_assign_pointer(root->rnode, ptr_to_indirect(slot));
		}

		/* Go a level down */
		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
		node = slot;
		slot = node->slots[offset];
		shift -= RADIX_TREE_MAP_SHIFT;
		height--;
	}

	/** 省略 */
	return 0;
}

篇幅关系简略了代码，代码首先使用 radix_tree_maxindex 计算树高度是否能够存储 index，不行就调用 radix_tree_extend 扩展树高度，紧接着根据key的每个字段定位节点，直到到达树叶子节点，如果树节点未分配空间，则调用 radix_tree_node_alloc 分配，最后再把 item 挂接到叶子节点上，完成插入操作。

基数树查找操作为：

static void *radix_tree_lookup_element(struct radix_tree_root *root,
			unsigned long index, int is_slot);

void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
{
	return radix_tree_lookup_element(root, index, 0);
}

其中 radix_tree_lookup_element 查找过程与插入操作类似，比较容易理解。