首页 > 代码库 > 内核源码分析之进程地址空间(基于3.16-rc4)

内核源码分析之进程地址空间(基于3.16-rc4)

所谓进程的地址空间,指的就是进程的虚拟地址空间。当创建一个进程时,内核会为该进程分配一个线性的地址空间(虚拟地址空间),有了虚拟地址空间后,内核就可以通过页表将进程的物理地址空间映射到其虚拟地址空间中,程序员所能看到的其实都是虚拟地址,物理地址对程序员而言是透明的。当程序运行时,MMU硬件机制会将程序中的虚拟地址转换成物理地址,然后在内存中找到指令和数据,来执行进程的代码。下面我们就来分析和进程的地址空间相关的各种数据结构和操作。

用到的数据结构:

1.内存描述符struct mm_struct (include/linux/mm_types.h)

  1 struct mm_struct {  2     struct vm_area_struct *mmap;        /* list of VMAs */  3     struct rb_root mm_rb;  4     u32 vmacache_seqnum;                   /* per-thread vmacache */  5 #ifdef CONFIG_MMU  6     unsigned long (*get_unmapped_area) (struct file *filp,  7                 unsigned long addr, unsigned long len,  8                 unsigned long pgoff, unsigned long flags);  9 #endif 10     unsigned long mmap_base;        /* base of mmap area */ 11     unsigned long mmap_legacy_base;         /* base of mmap area in bottom-up allocations */ 12     unsigned long task_size;        /* size of task vm space */ 13     unsigned long highest_vm_end;        /* highest vma end address */ 14     pgd_t * pgd; 15     atomic_t mm_users;            /* How many users with user space? */ 16     atomic_t mm_count;            /* How many references to "struct mm_struct" (users count as 1) */ 17     atomic_long_t nr_ptes;            /* Page table pages */ 18     int map_count;                /* number of VMAs */ 19  20     spinlock_t page_table_lock;        /* Protects page tables and some counters */ 21     struct rw_semaphore mmap_sem; 22  23     struct list_head mmlist;        /* List of maybe swapped mm‘s.    
These are globally strung 24                          * together off init_mm.mmlist, and are protected 25                          * by mmlist_lock 26                          */ 27  28  29     unsigned long hiwater_rss;    /* High-watermark of RSS usage */ 30     unsigned long hiwater_vm;    /* High-water virtual memory usage */ 31  32     unsigned long total_vm;        /* Total pages mapped */ 33     unsigned long locked_vm;    /* Pages that have PG_mlocked set */ 34     unsigned long pinned_vm;    /* Refcount permanently increased */ 35     unsigned long shared_vm;    /* Shared pages (files) */ 36     unsigned long exec_vm;        /* VM_EXEC & ~VM_WRITE */ 37     unsigned long stack_vm;        /* VM_GROWSUP/DOWN */ 38     unsigned long def_flags; 39     unsigned long start_code, end_code, start_data, end_data; 40     unsigned long start_brk, brk, start_stack; 41     unsigned long arg_start, arg_end, env_start, env_end; 42  43     unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ 44  45     /* 46      * Special counters, in some configurations protected by the 47      * page_table_lock, in other configurations by being atomic. 48      */ 49     struct mm_rss_stat rss_stat; 50  51     struct linux_binfmt *binfmt; 52  53     cpumask_var_t cpu_vm_mask_var; 54  55     /* Architecture-specific MM context */ 56     mm_context_t context; 57  58     unsigned long flags; /* Must use atomic bitops to access the bits */ 59  60     struct core_state *core_state; /* coredumping support */ 61 #ifdef CONFIG_AIO 62     spinlock_t            ioctx_lock; 63     struct kioctx_table __rcu    *ioctx_table; 64 #endif 65 #ifdef CONFIG_MEMCG 66     /* 67      * "owner" points to a task that is regarded as the canonical 68      * user/owner of this mm. 
All of the following must be true in 69      * order for it to be changed: 70      * 71      * current == mm->owner 72      * current->mm != mm 73      * new_owner->mm == mm 74      * new_owner->alloc_lock is held 75      */ 76     struct task_struct __rcu *owner; 77 #endif 78  79     /* store ref to file /proc/<pid>/exe symlink points to */ 80     struct file *exe_file; 81 #ifdef CONFIG_MMU_NOTIFIER 82     struct mmu_notifier_mm *mmu_notifier_mm; 83 #endif 84 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS 85     pgtable_t pmd_huge_pte; /* protected by page_table_lock */ 86 #endif 87 #ifdef CONFIG_CPUMASK_OFFSTACK 88     struct cpumask cpumask_allocation; 89 #endif 90 #ifdef CONFIG_NUMA_BALANCING 91     /* 92      * numa_next_scan is the next time that the PTEs will be marked 93      * pte_numa. NUMA hinting faults will gather statistics and migrate 94      * pages to new nodes if necessary. 95      */ 96     unsigned long numa_next_scan; 97  98     /* Restart point for scanning and setting pte_numa */ 99     unsigned long numa_scan_offset;100 101     /* numa_scan_seq prevents two threads setting pte_numa */102     int numa_scan_seq;103 #endif104 #if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)105     /*106      * An operation with batched TLB flushing is going on. Anything that107      * can move process memory needs to flush the TLB when moving a108      * PROT_NONE or PROT_NUMA mapped page.109      */110     bool tlb_flush_pending;111 #endif112     struct uprobes_state uprobes_state;113 };
struct mm_struct

每个进程描述符中都包含一个内存描述符,用来存放进程的虚拟地址空间。该结构体中的成员比较多,我们简要的介绍几个,以后用到时再具体分析。第2行的mmap指向了该虚拟地址空间中的第一个线性区,下边有介绍。内核将该虚拟地址空间中的所有线性区组织成一棵红黑树,第3行的mm_rb指向了红黑树的树根。第10行保存了mmap区域的基地址,第12行保存了该虚拟地址空间的大小,第13行保存了虚拟地址空间中最后一个线性区的结束地址。第14行pgd保存了进程的页全局目录表的地址。第18行保存了该虚拟地址空间的线性区个数。内核将所有进程的内存描述符(每个进程有一个)通过第23行的mmlist组织成双向链表。第39行分别代表虚拟地址空间中的代码段的起始和结束地址,数据段的起始和结束地址。第40行分别代表堆区的起始和结束地址,堆栈的起始地址。第41行分别代表了进程的命令行参数起始和结束地址,环境变量的起始和结束地址。对于内核线程而言,没有线性区。

2.线性区struct vm_area_struct(include/linux/mm_types.h)

 1 struct vm_area_struct { 2     /* The first cache line has the info for VMA tree walking. */ 3  4     unsigned long vm_start;        /* Our start address within vm_mm. */ 5     unsigned long vm_end;        /* The first byte after our end address 6                        within vm_mm. */ 7  8     /* linked list of VM areas per task, sorted by address */ 9     struct vm_area_struct *vm_next, *vm_prev;10 11     struct rb_node vm_rb;12 13     /*14      * Largest free memory gap in bytes to the left of this VMA.15      * Either between this VMA and vma->vm_prev, or between one of the16      * VMAs below us in the VMA rbtree and its ->vm_prev. This helps17      * get_unmapped_area find a free area of the right size.18      */19     unsigned long rb_subtree_gap;20 21     /* Second cache line starts here. */22 23     struct mm_struct *vm_mm;    /* The address space we belong to. */24     pgprot_t vm_page_prot;        /* Access permissions of this VMA. */25     unsigned long vm_flags;        /* Flags, see mm.h. */26 27     /*28      * For areas with an address space and backing store,29      * linkage into the address_space->i_mmap interval tree, or30      * linkage of vma in the address_space->i_mmap_nonlinear list.31      */32     union {33         struct {34             struct rb_node rb;35             unsigned long rb_subtree_last;36         } linear;37         struct list_head nonlinear;38     } shared;39 40     /*41      * A file‘s MAP_PRIVATE vma can be in both i_mmap tree and anon_vma42      * list, after a COW of one of the file pages.    A MAP_SHARED vma43      * can only be in the i_mmap tree.  An anonymous MAP_PRIVATE, stack44      * or brk vma (with NULL file) can only be in an anon_vma list.45      */46     struct list_head anon_vma_chain; /* Serialized by mmap_sem &47                       * page_table_lock */48     struct anon_vma *anon_vma;    /* Serialized by page_table_lock */49 50     /* Function pointers to deal with this struct. 
*/51     const struct vm_operations_struct *vm_ops;52 53     /* Information about our backing store: */54     unsigned long vm_pgoff;        /* Offset (within vm_file) in PAGE_SIZE55                        units, *not* PAGE_CACHE_SIZE */56     struct file * vm_file;        /* File we map to (can be NULL). */57     void * vm_private_data;        /* was vm_pte (shared mem) */58 59 #ifndef CONFIG_MMU60     struct vm_region *vm_region;    /* NOMMU mapping region */61 #endif62 #ifdef CONFIG_NUMA63     struct mempolicy *vm_policy;    /* NUMA policy for the VMA */64 #endif65 };
struct vm_area_struct

从上边可以看到,每个内存描述符中包含若干线性区,并且组织成红黑树的形式。内存描述符代表了进程的整个虚拟地址空间,包括了数据段,代码段,堆栈段等等,而每个线性区则用来表示一个段(区),比如代码段和数据段等。因此内存描述符中必然会有多个线性区。对于大型进程而言(比如数据库),它的线性区非常多,这就给搜索线性区带来很大挑战,于是将线性区组织成红黑树的形式,便于快速搜索。下面简单介绍下该结构体中的成员。第4-5行保存了该线性区的起始和结束虚拟地址。内存描述符中的所有线性区会用两种数据结构进行组织,一个是双向链表,另一个就是红黑树,从第9和11行可看出。使用两种数据结构来组织,更加方便于管理,当线性区数量不太多的时候,使用链表来搜索方便又快捷;当线性区数量很庞大时,链表就捉襟见肘了,此时红黑树就能派上用场。二者相互补充相互配合。第23行指向了该线性区所属的内存描述符。第24行存放该线性区的访问权限。第25行存放该线性区所涉及页的相关标志(这些标志表明页是可读的可写的还是可执行的等等)。这些标志定义在include/linux/mm.h中。线性区中的这些访问权限和标志用来设置页表的权限和标志。

线性区的处理函数:

1.find_vma函数(mm/mmap.c)

 1 struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) 2 { 3     struct rb_node *rb_node; 4     struct vm_area_struct *vma; 5  6     /* Check the cache first. */ 7     vma = vmacache_find(mm, addr); 8     if (likely(vma)) 9         return vma;10 11     rb_node = mm->mm_rb.rb_node;12     vma = NULL;13 14     while (rb_node) {15         struct vm_area_struct *tmp;16 17         tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);18 19         if (tmp->vm_end > addr) {20             vma = tmp;21             if (tmp->vm_start <= addr)22                 break;23             rb_node = rb_node->rb_left;24         } else25             rb_node = rb_node->rb_right;26     }27 28     if (vma)29         vmacache_update(addr, vma);30     return vma;31 }

该函数接收两个参数,一个是内存描述符,另一个是虚拟地址,然后返回该虚拟地址所在线性区的描述符指针。进程描述符中的vmacache[i]域中装有若干刚使用过的线性区,那么按照程序执行的局部性原理,新给出的虚拟地址在这些线性区中的概率非常大,因此第7行使用vmacache_find函数先在vmacache数组中查找,若找到直接返回。否则,就在红黑树中找。第11行将红黑树的树根保存到rb_node中,第14-26行,遍历整棵红黑树,第17行获取红黑树节点rb_node所在的线性区描述符指针保存到tmp中,第19行如果该线性区的vm_end>addr,那么addr一定在该线性区或者该线性区之前的线性区,则遍历红黑树的左子树,否则遍历右子树,第21行如果addr>=某线性区的vm_start,无疑addr处于该线性区中。(线性区既然组织成红黑树的形式,那么根的左子树所有vm_end一定小于根,根的右子树所有vm_end一定大于根),第30行返回找到的线性区指针,也可能没找到,返回null。

2.find_vma_intersection函数(include/linux/mm.h)

1 static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * mm, unsigned long start_addr, unsigned long end_addr)2 {3     struct vm_area_struct * vma = find_vma(mm,start_addr);4 5     if (vma && end_addr <= vma->vm_start)6         vma = NULL;7     return vma;8 }

该函数查找与以start_addr开始、以end_addr结束的地址区间有重叠的第一个线性区,若不存在这样的线性区则返回NULL。

3.get_unmapped_area函数(mm/mmap.c)

 1 unsigned long 2 get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, 3         unsigned long pgoff, unsigned long flags) 4 { 5     unsigned long (*get_area)(struct file *, unsigned long, 6                   unsigned long, unsigned long, unsigned long); 7  8     unsigned long error = arch_mmap_check(addr, len, flags); 9     if (error)10         return error;11 12     /* Careful about overflows.. */13     if (len > TASK_SIZE)14         return -ENOMEM;15 16     get_area = current->mm->get_unmapped_area;17     if (file && file->f_op->get_unmapped_area)18         get_area = file->f_op->get_unmapped_area;19     addr = get_area(file, addr, len, pgoff, flags);20     if (IS_ERR_VALUE(addr))21         return addr;22 23     if (addr > TASK_SIZE - len)24         return -ENOMEM;25     if (addr & ~PAGE_MASK)26         return -EINVAL;27 28     addr = arch_rebalance_pgtables(addr, len);29     error = security_mmap_addr(addr);30     return error ? error : addr;31 }

该函数用来查找可以使用的线性区,参数addr代表要查找线性区的开始地址,len代表线性区大小。如果是为了文件的内存映射而查找,那么参数file不为空,第18行,get_area指针指向file结构体中的get_unmapped_area函数;如果是为了匿名映射查找,第16行get_area指针指向内存描述符中的get_unmapped_area函数。第19行执行get_area函数。file结构体中的get_unmapped_area函数我们不讨论,只讨论内存描述符中的该函数。内存描述符中的该函数有两个版本,分别是arch_get_unmapped_area和arch_get_unmapped_area_topdown,前者从线性地址的低地址向高端地址方向查找,后者从用户态堆栈开始向低地址方向查找,根据需求选择不同的函数。下面分析下arch_get_unmapped_area函数的代码(mm/mmap.c):

 

 1 unsigned long 2 arch_get_unmapped_area(struct file *filp, unsigned long addr, 3         unsigned long len, unsigned long pgoff, unsigned long flags) 4 { 5     struct mm_struct *mm = current->mm; 6     struct vm_area_struct *vma; 7     struct vm_unmapped_area_info info; 8  9     if (len > TASK_SIZE - mmap_min_addr)10         return -ENOMEM;11 12     if (flags & MAP_FIXED)13         return addr;14 15     if (addr) {16         addr = PAGE_ALIGN(addr);17         vma = find_vma(mm, addr);18         if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&19             (!vma || addr + len <= vma->vm_start))20             return addr;21     }22 23     info.flags = 0;24     info.length = len;25     info.low_limit = mm->mmap_base;26     info.high_limit = TASK_SIZE;27     info.align_mask = 0;28     return vm_unmapped_area(&info);29 }

 

第15行将addr地址按照页面地址向上对齐(调整为4K的整数倍),第17行可看出该函数是通过调用find_vma实现查找。由于第15行对addr进行了向上对齐操作,因此找到的vma中可能包含addr(这种情况下就说明以addr地址开始没有可用的vma),也可能在addr之后。第18行如果0<=找到的地址+len<=用户虚拟地址空间大小TASK_SIZE(4TB),并且要么vma为空(就说明该addr的线性区可用),要么vma在addr之后,同时addr+len不能延伸到vma中,返回addr地址。否则的话,就是执行第28行的函数,重新分配。下面看下vm_unmapped_area函数(include/linux/mm.h)

 

1 static inline unsigned long2 vm_unmapped_area(struct vm_unmapped_area_info *info)3 {4     if (!(info->flags & VM_UNMAPPED_AREA_TOPDOWN))5         return unmapped_area(info);6     else7         return unmapped_area_topdown(info);8 }

 

该函数调用unmapped_area或者unmapped_area_topdown来查找空闲的区域。我们分析下前者,后者和前者类似。代码如下:

 

  1 unsigned long unmapped_area(struct vm_unmapped_area_info *info)  2 {  3     /*  4      * We implement the search by looking for an rbtree node that  5      * immediately follows a suitable gap. That is,  6      * - gap_start = vma->vm_prev->vm_end <= info->high_limit - length;  7      * - gap_end   = vma->vm_start        >= info->low_limit  + length;  8      * - gap_end - gap_start >= length  9      */ 10  11     struct mm_struct *mm = current->mm; 12     struct vm_area_struct *vma; 13     unsigned long length, low_limit, high_limit, gap_start, gap_end; 14  15     /* Adjust search length to account for worst case alignment overhead */ 16     length = info->length + info->align_mask; 17     if (length < info->length) 18         return -ENOMEM; 19  20     /* Adjust search limits by the desired length */ 21     if (info->high_limit < length) 22         return -ENOMEM; 23     high_limit = info->high_limit - length; 24  25     if (info->low_limit > high_limit) 26         return -ENOMEM; 27     low_limit = info->low_limit + length; 28  29     /* Check if rbtree root looks promising */ 30     if (RB_EMPTY_ROOT(&mm->mm_rb)) 31         goto check_highest; 32     vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb); 33     if (vma->rb_subtree_gap < length) 34         goto check_highest; 35  36     while (true) { 37         /* Visit left subtree if it looks promising */ 38         gap_end = vma->vm_start; 39         if (gap_end >= low_limit && vma->vm_rb.rb_left) { 40             struct vm_area_struct *left = 41                 rb_entry(vma->vm_rb.rb_left, 42                      struct vm_area_struct, vm_rb); 43             if (left->rb_subtree_gap >= length) { 44                 vma = left; 45                 continue; 46             } 47         } 48  49         gap_start = vma->vm_prev ? 
vma->vm_prev->vm_end : 0; 50 check_current: 51         /* Check if current node has a suitable gap */ 52         if (gap_start > high_limit) 53             return -ENOMEM; 54         if (gap_end >= low_limit && gap_end - gap_start >= length) 55             goto found; 56  57         /* Visit right subtree if it looks promising */ 58         if (vma->vm_rb.rb_right) { 59             struct vm_area_struct *right = 60                 rb_entry(vma->vm_rb.rb_right, 61                      struct vm_area_struct, vm_rb); 62             if (right->rb_subtree_gap >= length) { 63                 vma = right; 64                 continue; 65             } 66         } 67  68         /* Go back up the rbtree to find next candidate node */ 69         while (true) { 70             struct rb_node *prev = &vma->vm_rb; 71             if (!rb_parent(prev)) 72                 goto check_highest; 73             vma = rb_entry(rb_parent(prev), 74                        struct vm_area_struct, vm_rb); 75             if (prev == vma->vm_rb.rb_left) { 76                 gap_start = vma->vm_prev->vm_end; 77                 gap_end = vma->vm_start; 78                 goto check_current; 79             } 80         } 81     } 82  83 check_highest: 84     /* Check highest gap, which does not precede any rbtree node */ 85     gap_start = mm->highest_vm_end; 86     gap_end = ULONG_MAX;  /* Only for VM_BUG_ON below */ 87     if (gap_start > high_limit) 88         return -ENOMEM; 89  90 found: 91     /* We found a suitable gap. Clip it with the original low_limit. */ 92     if (gap_start < info->low_limit) 93         gap_start = info->low_limit; 94  95     /* Adjust gap address to the desired alignment */ 96     gap_start += (info->align_offset - gap_start) & info->align_mask; 97  98     VM_BUG_ON(gap_start + info->length > info->high_limit); 99     VM_BUG_ON(gap_start + info->length > gap_end);100     return gap_start;101 }
unmapped_area