首页 > 代码库 > Linux ioremap 的实现

Linux ioremap 的实现

Linux ioremap 的实现

linux, memory, ioremap

在 linux kernel 的代码中,经常看到 ioremap 函数。
其功能是将给定的物理地址映射为虚拟地址。
注意,此处的物理地址并不是真正内存的物理地址,而是cpu上的io memory。
可以参考芯片《Reference Manual》中的 memory map 章节。
本文主要学习 ioremap 是如何实现的。

ioremap 的定义:
/*
 * ioremap() forwards to __arch_ioremap() with the memory type fixed to
 * MT_DEVICE.
 */
#define ioremap(cookie,size)  __arch_ioremap((cookie), (size), MT_DEVICE)

/* Memory-type value passed by ioremap(); also the index of the first mem_types[] entry. */
#define MT_DEVICE  0

/* __arch_ioremap is an alias for __arm_ioremap. */
#define __arch_ioremap   __arm_ioremap

/*
 * Map the physical I/O range that starts at phys_addr and is size bytes
 * long, using memory type mtype.
 *
 * Thin wrapper: forwards to __arm_ioremap_caller(), passing
 * __builtin_return_address(0) as the caller argument.
 */
void __iomem *
__arm_ioremap(unsigned long phys_addr, size_t size, unsigned int mtype)
{
 return __arm_ioremap_caller(phys_addr, size, mtype,
   __builtin_return_address(0));
}

/*
 * Validate the requested physical range, split phys_addr into a page frame
 * number and an in-page offset, and hand both to __arm_ioremap_pfn_caller().
 *
 * Returns NULL when size is zero or when phys_addr + size - 1 wraps around;
 * otherwise returns whatever __arm_ioremap_pfn_caller() returns.
 */
void __iomem *__arm_ioremap_caller(unsigned long phys_addr, size_t size,
 unsigned int mtype, void *caller)
{
 unsigned long range_end;
 unsigned long page_off;
 unsigned long frame;

 /* A zero-length mapping is rejected outright. */
 if (size == 0)
  return NULL;

 /* Last byte of the range; ending below the start means wraparound. */
 range_end = phys_addr + size - 1;
 if (range_end < phys_addr)
  return NULL;

 page_off = phys_addr & ~PAGE_MASK;
 frame = __phys_to_pfn(phys_addr);

 return __arm_ioremap_pfn_caller(frame, page_off, size, mtype, caller);
}

/*
 * Map size bytes of I/O memory, starting offset bytes into page frame pfn,
 * into kernel virtual address space using memory type mtype.
 *
 * caller is forwarded unchanged to get_vm_area_caller().
 *
 * Returns the mapped virtual address (area start plus offset), or NULL when
 * the pfn fails the alignment or pfn_valid() checks, mtype has no mem_types
 * entry, no virtual area can be obtained, or the remap step reports an error.
 */
void __iomem * __arm_ioremap_pfn_caller(unsigned long pfn,
 unsigned long offset, size_t size, unsigned int mtype, void *caller)
{
 const struct mem_type *type;
 int err;
 unsigned long addr;
  struct vm_struct * area;

 /*
  * High mappings must be supersection aligned
  */
 // i.e. reject a pfn >= 0x100000 whose physical address is not supersection aligned
 if (pfn >= 0x100000 && (__pfn_to_phys(pfn) & ~SUPERSECTION_MASK))
  return NULL;

 /*
  * Don't allow RAM to be mapped - this causes problems with ARMv6+
  */
 // The target must not be RAM; only the SoC's I/O memory may be mapped
 /*
 int pfn_valid(unsigned long pfn)
 {
  return memblock_is_memory(pfn << PAGE_SHIFT);
 }
 */
 if (WARN_ON(pfn_valid(pfn)))
  return NULL;

 // get_mem_type() is shown later in this article
 // Per the ioremap() definition earlier, mtype is MT_DEVICE on that path
 type = get_mem_type(mtype);
 if (!type)
  return NULL;

 /*
  * Page align the mapping size, taking account of any offset.
  */
 size = PAGE_ALIGN(offset + size);

 // get_vm_area_caller() is shown later in this article
 area = get_vm_area_caller(size, VM_IOREMAP, caller);
  if (!area)
   return NULL;
  addr = (unsigned long)area->addr;

 /*
  * Non-SMP builds only: when the physical address, the size and the
  * virtual address are all aligned to SUPERSECTION_MASK (plus the extra
  * CPU/domain conditions below) or to PMD_MASK, map with supersections or
  * sections and flag the area with VM_ARM_SECTION_MAPPING. Every other
  * case, and every SMP build, falls through to ioremap_page_range().
  */
#ifndef CONFIG_SMP
 if (DOMAIN_IO == 0 &&
     (((cpu_architecture() >= CPU_ARCH_ARMv6) && (get_cr() & CR_XP)) ||
        cpu_is_xsc3()) && pfn >= 0x100000 &&
        !((__pfn_to_phys(pfn) | size | addr) & ~SUPERSECTION_MASK)) {
  area->flags |= VM_ARM_SECTION_MAPPING;
  err = remap_area_supersections(addr, pfn, size, type);
 } else if (!((__pfn_to_phys(pfn) | size | addr) & ~PMD_MASK)) {
  area->flags |= VM_ARM_SECTION_MAPPING;
  err = remap_area_sections(addr, pfn, size, type);
 } else
#endif
  // ioremap_page_range() is shown later in this article
  err = ioremap_page_range(addr, addr + size, __pfn_to_phys(pfn),
      __pgprot(type->prot_pte));

 // Remap failed: call vunmap() on the area address and report failure
 if (err) {
   vunmap((void *)addr);
   return NULL;
  }

 flush_cache_vmap(addr, addr + size);
 // Re-apply the in-page offset to the page-aligned virtual base
 return (void __iomem *) (offset + addr);
}

get_mem_type 函数的实现:
/*
 * Look up the mem_types[] entry for a memory-type index.
 *
 * Returns a pointer to mem_types[type], or NULL when type is not a valid
 * index into the table.
 */
const struct mem_type *get_mem_type(unsigned int type)
{
 if (type >= ARRAY_SIZE(mem_types))
  return NULL;

 return &mem_types[type];
}
mem_types 的定义:
/*
 * Per-memory-type attribute table, indexed by the MT_* constants.
 *
 * Each entry may set prot_pte, prot_l1, prot_sect and domain; any field an
 * initializer leaves out is zero because the array has static storage.
 * get_mem_type() hands out pointers into this table.
 */
static struct mem_type mem_types[] = {
 [MT_DEVICE] = {    /* Strongly ordered / ARMv6 shared device */
  .prot_pte = PROT_PTE_DEVICE | L_PTE_MT_DEV_SHARED |
      L_PTE_SHARED,
  .prot_l1 = PMD_TYPE_TABLE,
  .prot_sect = PROT_SECT_DEVICE | PMD_SECT_S,
  .domain  = DOMAIN_IO,
 },
 [MT_DEVICE_NONSHARED] = { /* ARMv6 non-shared device */
  .prot_pte = PROT_PTE_DEVICE | L_PTE_MT_DEV_NONSHARED,
  .prot_l1 = PMD_TYPE_TABLE,
  .prot_sect = PROT_SECT_DEVICE,
  .domain  = DOMAIN_IO,
 },
 [MT_DEVICE_CACHED] = {   /* ioremap_cached */
  .prot_pte = PROT_PTE_DEVICE | L_PTE_MT_DEV_CACHED,
  .prot_l1 = PMD_TYPE_TABLE,
  .prot_sect = PROT_SECT_DEVICE | PMD_SECT_WB,
  .domain  = DOMAIN_IO,
 }, 
 [MT_DEVICE_WC] = { /* ioremap_wc */
  .prot_pte = PROT_PTE_DEVICE | L_PTE_MT_DEV_WC,
  .prot_l1 = PMD_TYPE_TABLE,
  .prot_sect = PROT_SECT_DEVICE,
  .domain  = DOMAIN_IO,
 },
 [MT_UNCACHED] = {
  .prot_pte = PROT_PTE_DEVICE,
  .prot_l1 = PMD_TYPE_TABLE,
  .prot_sect = PMD_TYPE_SECT | PMD_SECT_XN,
  .domain  = DOMAIN_IO,
 },
 /* The next two entries and MT_ROM below set only prot_sect and domain. */
 [MT_CACHECLEAN] = {
  .prot_sect = PMD_TYPE_SECT | PMD_SECT_XN,
  .domain    = DOMAIN_KERNEL,
 },
 [MT_MINICLEAN] = {
  .prot_sect = PMD_TYPE_SECT | PMD_SECT_XN | PMD_SECT_MINICACHE,
  .domain    = DOMAIN_KERNEL,
 },
 /* The two vector entries set no prot_sect and use DOMAIN_USER. */
 [MT_LOW_VECTORS] = {
  .prot_pte  = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
    L_PTE_RDONLY,
  .prot_l1   = PMD_TYPE_TABLE,
  .domain    = DOMAIN_USER,
 },
 [MT_HIGH_VECTORS] = {
  .prot_pte  = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
    L_PTE_USER | L_PTE_RDONLY,
  .prot_l1   = PMD_TYPE_TABLE,
  .domain    = DOMAIN_USER,
 },
 [MT_MEMORY] = {
  .prot_pte  = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY,
  .prot_l1   = PMD_TYPE_TABLE,
  .prot_sect = PMD_TYPE_SECT | PMD_SECT_AP_WRITE,
  .domain    = DOMAIN_KERNEL,
 },
 [MT_ROM] = {
  .prot_sect = PMD_TYPE_SECT,
  .domain    = DOMAIN_KERNEL,
 },
 [MT_MEMORY_NONCACHED] = {
  .prot_pte  = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
    L_PTE_MT_BUFFERABLE,
  .prot_l1   = PMD_TYPE_TABLE,
  .prot_sect = PMD_TYPE_SECT | PMD_SECT_AP_WRITE,
  .domain    = DOMAIN_KERNEL,
 },
 [MT_MEMORY_DTCM] = {
  .prot_pte  = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
    L_PTE_XN,
  .prot_l1   = PMD_TYPE_TABLE,
  .prot_sect = PMD_TYPE_SECT | PMD_SECT_XN,
  .domain    = DOMAIN_KERNEL,
 },
 [MT_MEMORY_ITCM] = {
  .prot_pte  = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY,
  .prot_l1   = PMD_TYPE_TABLE,
  .domain    = DOMAIN_KERNEL,
 },
};

get_vm_area_caller 函数的实现:
/*
 * Request a vm_struct from __get_vm_area_node(), restricted to the range
 * VMALLOC_START..VMALLOC_END.
 *
 * size, flags and caller are passed through unchanged; the remaining
 * arguments are fixed to align = 1, node = -1 and GFP_KERNEL.
 * Returns whatever __get_vm_area_node() returns.
 */
struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
    void *caller)
{
/*
 * Just any arbitrary offset to the start of the vmalloc VM area: the
 * current 8MB value just means that there will be a 8MB "hole" after the
 * physical memory until the kernel virtual memory starts.  That means that
 * any out-of-bounds memory accesses will hopefully be caught.
 * The vmalloc() routines leaves a hole of 4kB between each vmalloced
 * area for the same reason. ;)
 *
 * Note that platforms may override VMALLOC_START, but they must provide
 * VMALLOC_END.  VMALLOC_END defines the (exclusive) limit of this space,
 * which may not overlap IO space.
 */
/*
#ifndef VMALLOC_START
#define VMALLOC_OFFSET  (8*1024*1024)
// high_memory is assigned in bootmem_init() in arch/arm/mm/init.c; that function is shown later in this article
#define VMALLOC_START  (((unsigned long)high_memory + VMALLOC_OFFSET) & ~(VMALLOC_OFFSET-1))
#endif
*/
/* vmalloc ending address */
#define VMALLOC_END       0xf2000000UL
 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
      -1, GFP_KERNEL, caller);
}

/*
 * Allocate a vm_struct and reserve a kernel virtual address range for it.
 *
 * @size:     requested length in bytes; page aligned here, then grown by
 *            one guard page before the range is reserved
 * @align:    requested alignment; overridden for VM_IOREMAP requests
 * @flags:    VM_* flags stored in the resulting vm_struct
 * @start:    lowest virtual address the range may use
 * @end:      upper bound of the virtual address range
 * @node:     memory node used for the allocations
 * @gfp_mask: allocation flags
 * @caller:   recorded in the vm_struct by setup_vmalloc_vm()
 *
 * Returns the new vm_struct, or NULL when size aligns to zero, the
 * vm_struct cannot be allocated, or no virtual range can be reserved.
 * Must not be called from interrupt context (BUG_ON below).
 */
static struct vm_struct *__get_vm_area_node(unsigned long size,
  unsigned long align, unsigned long flags, unsigned long start,
  unsigned long end, int node, gfp_t gfp_mask, void *caller)
{
 /*
  * Plain automatic variable: a static local would be a single slot
  * shared by every concurrent caller of this function.
  */
 struct vmap_area *va;
 struct vm_struct *area;

 BUG_ON(in_interrupt());

 /* bits in flags of vmalloc's vm_struct below */
 // #define VM_IOREMAP 0x00000001 /* ioremap() and friends */
 if (flags & VM_IOREMAP) {
  /*
   * For ioremap requests the alignment is derived from the size:
   * 1 << fls(size), clamped to [PAGE_SHIFT, IOREMAP_MAX_ORDER].
   */
  int bit = fls(size);

  if (bit > IOREMAP_MAX_ORDER)
   bit = IOREMAP_MAX_ORDER;
  else if (bit < PAGE_SHIFT)
   bit = PAGE_SHIFT;

  align = 1ul << bit;
 }

 size = PAGE_ALIGN(size);
 if (unlikely(!size))
  return NULL;

 /**
  * kzalloc_node - allocate zeroed memory from a particular memory node.
  * @size: how many bytes of memory are required.
  * @flags: the type of memory to allocate (see kmalloc).
  * @node: memory node from which to allocate
  */
 // Allocate the vm_struct descriptor
 area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
 if (unlikely(!area))
  return NULL;

 /*
  * We always allocate a guard page.
  */
 size += PAGE_SIZE;

 // On the ioremap path start and end are VMALLOC_START and VMALLOC_END.
 // align is passed in as 1 by get_vm_area_caller(), but has been
 // recomputed above whenever VM_IOREMAP is set.
 // Comment from alloc_vmap_area():
 /*
  * Allocate a region of KVA of the specified size and alignment, within the
  * vstart and vend.
  */
 // Per the article's notes (alloc_vmap_area() itself is not shown here):
 // every in-use virtual range is described by a vmap_area, all of which
 // live in the red-black tree vmap_area_root. alloc_vmap_area() searches
 // that tree for a free gap of the requested size between start and end,
 // creates a vmap_area describing it and inserts it into the tree.
 va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
 if (IS_ERR(va)) {
  /* No range available: drop the descriptor allocated above. */
  kfree(area);
  return NULL;
 }

 /*
  * When this function is called from __vmalloc_node_range,
  * we do not add vm_struct to vmlist here to avoid
  * accessing uninitialized members of vm_struct such as
  * pages and nr_pages fields. They will be set later.
  * To distinguish it from others, we use a VM_UNLIST flag.
  */
 if (flags & VM_UNLIST)
  setup_vmalloc_vm(area, va, flags, caller);
 else
  // The VM_UNLIST branch above is the special case; the ioremap
  // path takes this branch. insert_vmalloc_vm() is shown later in
  // this article.
  insert_vmalloc_vm(area, va, flags, caller);

 return area;
}

high_memory 在 arch/arm/mm/init.c 文件中的 bootmem_init 函数中赋值,该函数的实现:
/*
 * Boot-time memory setup: determine the pfn limits of the memory banks,
 * run the bootmem/sparsemem initialisation steps in order, then publish
 * high_memory, max_low_pfn and max_pfn.
 */
void __init bootmem_init(void)
{
 unsigned long min, max_low, max_high;

 max_low = max_high = 0;

 // Find the lowest pfn, the top of low memory and the top of high memory
 // find_limits() is shown later in this article
 find_limits(&min, &max_low, &max_high);

 arm_bootmem_init(min, max_low);

 /*
  * Sparsemem tries to allocate bootmem in memory_present(),
  * so must be done after the fixed reservations
  */
 arm_memory_present();

 /*
  * sparse_init() needs the bootmem allocator up and running.
  */
 sparse_init();

 /*
  * Now free the memory - free_area_init_node needs
  * the sparse mem_map arrays initialized by sparse_init()
  * for memmap_init_zone(), otherwise all PFNs are invalid.
  */
 arm_bootmem_free(min, max_low, max_high);

 // high_memory: the virtual address one past the last low-memory byte,
 // i.e. where high memory begins
 high_memory = __va(((phys_addr_t)max_low << PAGE_SHIFT) - 1) + 1;

 /*
  * This doesn't seem to be used by the Linux memory manager any
  * more, but is used by ll_rw_block.  If we can get rid of it, we
  * also get rid of some of the stuff above as well.
  *
  * Note: max_low_pfn and max_pfn reflect the number of _pages_ in
  * the system, not the maximum PFN.
  */
 max_low_pfn = max_low - PHYS_PFN_OFFSET;
 max_pfn = max_high - PHYS_PFN_OFFSET;
}

find_limits 函数实现:
/*
 * Scan every bank in meminfo and report three page-frame limits:
 *   *min      - smallest bank start pfn
 *   *max_high - largest bank end pfn over all banks
 *   *max_low  - largest bank end pfn over banks not flagged highmem
 *
 * With no banks, *min is left at -1UL and both maxima at 0.
 */
static void __init find_limits(unsigned long *min, unsigned long *max_low,
 unsigned long *max_high)
{
 struct meminfo *info = &meminfo;
 int idx;

 *min = -1UL;
 *max_low = *max_high = 0;

 for_each_bank (idx, info) {
  struct membank *cur = &info->bank[idx];
  unsigned long first_pfn = bank_pfn_start(cur);
  unsigned long limit_pfn = bank_pfn_end(cur);

  if (first_pfn < *min)
   *min = first_pfn;
  if (limit_pfn > *max_high)
   *max_high = limit_pfn;
  /*
   * Highmem banks never raise the low-memory limit; the highmem
   * flag is set in sanity_check_meminfo().
   */
  if (!cur->highmem && limit_pfn > *max_low)
   *max_low = limit_pfn;
 }
}

sanity_check_meminfo 函数的实现:
/*
 * Abridged excerpt ("..." marks code the article left out): walk the
 * meminfo banks, copy bank i into slot j and set the copy's highmem flag.
 *
 * In the visible code the local highmem flag is only ever set, never
 * cleared, so once one bank qualifies every later bank is flagged too.
 */
void __init sanity_check_meminfo(void)
{
 int i, j, highmem = 0;

 for (i = 0, j = 0; i < meminfo.nr_banks; i++) {
  struct membank *bank = &meminfo.bank[j];
  *bank = meminfo.bank[i];

#ifdef CONFIG_HIGHMEM
  // static void * __initdata vmalloc_min = (void *)(VMALLOC_END - SZ_128M);
  // A bank whose virtual start is at or above vmalloc_min, or below
  // PAGE_OFFSET, switches the highmem flag on
  if (__va(bank->start) >= vmalloc_min ||
      __va(bank->start) < (void *)PAGE_OFFSET)
   highmem = 1;

  bank->highmem = highmem;

  ...
#else
  // Without CONFIG_HIGHMEM nothing visible here sets highmem, so it stays 0
  // NOTE(review): the matching #endif is not shown in this excerpt
  bank->highmem = highmem;

  ...
 }
 ...
}

回到函数 get_vm_area_caller 的实现。

insert_vmalloc_vm 函数的实现:
/*
 * Initialise vm from va via setup_vmalloc_vm(), then add vm to the global
 * vmlist via insert_vmalloc_vmlist().
 */
static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
         unsigned long flags, void *caller)
{
 // vm (struct vm_struct) is allocated in __get_vm_area_node()
 // va (struct vmap_area) is obtained in __get_vm_area_node() by calling alloc_vmap_area()
 setup_vmalloc_vm(vm, va, flags, caller);
 insert_vmalloc_vmlist(vm);
}

setup_vmalloc_vm 函数的实现:
/*
 * Fill in vm from va and link the two descriptors together.
 *
 * vm receives flags and caller as passed in, its address from va->va_start
 * and its size from the va_start..va_end span. va receives a back-pointer
 * to vm and has VM_VM_AREA set in its flags.
 */
static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
         unsigned long flags, void *caller)
{
 vm->flags = flags;
 vm->addr = (void *)va->va_start;
 vm->size = va->va_end - va->va_start;
 vm->caller = caller;
 va->vm = vm;
 va->flags |= VM_VM_AREA;
}

insert_vmalloc_vmlist 函数的实现:
/*
 * Clear VM_UNLIST on vm and insert it into the global vmlist, keeping the
 * list ordered by ascending addr. The list is modified under the
 * vmlist_lock write lock.
 */
static void insert_vmalloc_vmlist(struct vm_struct *vm)
{
 struct vm_struct **link;

 vm->flags &= ~VM_UNLIST;

 write_lock(&vmlist_lock);

 /* Advance to the first entry whose address is not below vm's. */
 link = &vmlist;
 while (*link != NULL && (*link)->addr < vm->addr)
  link = &(*link)->next;

 /* Splice vm in front of that entry, or at the tail if none was found. */
 vm->next = *link;
 *link = vm;

 write_unlock(&vmlist_lock);
}

ioremap_page_range 函数的实现:
/*
 * Map the kernel virtual range [addr, end) onto physical memory starting at
 * phys_addr, with page protection prot, by walking the kernel page
 * directory one pgd entry at a time.
 *
 * addr must be below end (BUG_ON otherwise).
 * Returns 0 on success, or the first error from ioremap_pud_range(); the
 * walk stops at that error, and flush_cache_vmap() runs over the whole
 * range either way.
 */
int ioremap_page_range(unsigned long addr,
         unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
{
 pgd_t *pgd;
 unsigned long start;
 unsigned long next;
 int err;

 BUG_ON(addr >= end);

 start = addr;
 /* Bias phys_addr so that phys_addr + addr is the physical address for addr. */
 phys_addr -= addr;
 pgd = pgd_offset_k(addr);
 do {
  next = pgd_addr_end(addr, end);
  err = ioremap_pud_range(pgd, addr, next, phys_addr+addr, prot);
  if (err)
   break;
 } while (pgd++, addr = next, addr != end);

 flush_cache_vmap(start, end);

 return err;
}

/*
 * Map [addr, end) under one pgd entry: allocate the pud level for init_mm
 * and process the range one pud entry at a time via ioremap_pmd_range().
 *
 * Returns 0 on success and -ENOMEM when pud_alloc() fails; any non-zero
 * result from ioremap_pmd_range() is also reported as -ENOMEM.
 */
static inline int ioremap_pud_range(pgd_t *pgd, unsigned long addr,
  unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
{
 pud_t *pud;
 unsigned long next;

 /* Bias phys_addr so that phys_addr + addr tracks the current address. */
 phys_addr -= addr;
 pud = pud_alloc(&init_mm, pgd, addr);
 if (!pud)
  return -ENOMEM;
 do {
  next = pud_addr_end(addr, end);
  if (ioremap_pmd_range(pud, addr, next, phys_addr + addr, prot))
   return -ENOMEM;
 } while (pud++, addr = next, addr != end);
 return 0;
}

/*
 * Map [addr, end) under one pud entry: allocate the pmd level for init_mm
 * and process the range one pmd entry at a time via ioremap_pte_range().
 *
 * Returns 0 on success and -ENOMEM when pmd_alloc() fails; any non-zero
 * result from ioremap_pte_range() is also reported as -ENOMEM.
 */
static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
  unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
{
 pmd_t *pmd;
 unsigned long next;

 /* Bias phys_addr so that phys_addr + addr tracks the current address. */
 phys_addr -= addr;
 pmd = pmd_alloc(&init_mm, pud, addr);
 if (!pmd)
  return -ENOMEM;
 do {
  next = pmd_addr_end(addr, end);
  if (ioremap_pte_range(pmd, addr, next, phys_addr + addr, prot))
   return -ENOMEM;
 } while (pmd++, addr = next, addr != end);
 return 0;
}

/*
 * Fill the page-table entries covering [addr, end) under one pmd entry.
 *
 * Each page in the range gets a PTE built from consecutive page frames,
 * starting at phys_addr >> PAGE_SHIFT, with protection prot. An entry that
 * is already populated trips BUG_ON.
 *
 * Returns 0 on success, -ENOMEM when pte_alloc_kernel() fails.
 */
static int ioremap_pte_range(pmd_t *pmd, unsigned long addr,
  unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
{
 u64 frame = phys_addr >> PAGE_SHIFT;
 pte_t *entry = pte_alloc_kernel(pmd, addr);

 if (!entry)
  return -ENOMEM;

 /* At least one page is always mapped; stop once addr reaches end. */
 for (;;) {
  BUG_ON(!pte_none(*entry));
  set_pte_at(&init_mm, addr, entry, pfn_pte(frame, prot));
  frame++;
  entry++;
  addr += PAGE_SIZE;
  if (addr == end)
   break;
 }
 return 0;
}

上面几个函数的功能,是建立 linux 4级页表。
linux 4级页表参考:
http://larmbr.me/2014/01/19/the-evolution-of-4-level-page-talbe-in-linux/

总结一下。
ioremap中首先做了一些检查,其中一项检查是要处理的物理地址是不是 RAM ,因为 ioremap 只处理 soc 的 io memory ,不处理 RAM 。
分配一个 vm_struct 结构体。
之后分配一个 vmap_area 结构体,并查找红黑树 vmap_area_root 找到合适的 hole 。
然后初始化 vm_struct 结构体和 vmap_area 结构体的一些成员。
最后建立 linux 的4级内存页表。
4级即: PGD -> PUD -> PMD -> PTE

 

Linux ioremap 的实现