14.6 shrink_inactive_list函数
shrink_inactive_list()函数扫描不活跃LRU链表去尝试回收页面,并且返回已经回收的页面数量,代码如下:
[kswapd()->balance_pgdat()->kswapd_shrink_zone()->shrink_zone()->shrink_lruvec()->shrink_active_list()]
/** shrink_inactive_list() is a helper for shrink_zone(). It returns the number* of reclaimed pages*/
static noinline_for_stack unsigned long
shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,struct scan_control *sc, enum lru_list lru)
{/*初始化一个临时链表page_list*/LIST_HEAD(page_list);unsigned long nr_scanned;unsigned long nr_reclaimed = 0;unsigned long nr_taken;unsigned long nr_dirty = 0;unsigned long nr_congested = 0;unsigned long nr_unqueued_dirty = 0;unsigned long nr_writeback = 0;unsigned long nr_immediate = 0;isolate_mode_t isolate_mode = 0;int file = is_file_lru(lru);struct zone *zone = lruvec_zone(lruvec);struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;while (unlikely(too_many_isolated(zone, file, sc))) {congestion_wait(BLK_RW_ASYNC, HZ/10);/* We are about to die and free our memory. Return now. */if (fatal_signal_pending(current))return SWAP_CLUSTER_MAX;}lru_add_drain();if (!sc->may_unmap)isolate_mode |= ISOLATE_UNMAPPED;if (!sc->may_writepage)isolate_mode |= ISOLATE_CLEAN;spin_lock_irq(&zone->lru_lock);/*把不活跃链表的页面分离到临时链表page_list中。*/nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,&nr_scanned, sc, isolate_mode, lru);__mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);if (global_reclaim(sc)) {__mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);if (current_is_kswapd())__count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned);else__count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned);}spin_unlock_irq(&zone->lru_lock);if (nr_taken == 0)return 0;/*扫描page_list链表的页面并返回已回收的页面数量,下面查看此函数实现*/nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP,&nr_dirty, &nr_unqueued_dirty, &nr_congested,&nr_writeback, &nr_immediate,false);spin_lock_irq(&zone->lru_lock);reclaim_stat->recent_scanned[file] += nr_taken;if (global_reclaim(sc)) {if (current_is_kswapd())__count_zone_vm_events(PGSTEAL_KSWAPD, zone,nr_reclaimed);else__count_zone_vm_events(PGSTEAL_DIRECT, zone,nr_reclaimed);}/*扫描page_list链表,并把相应的page添加到对应LRU链表中。有些满足释放条件的page,即已经回收的页面将会在free_hot_cold_page_list()函数中被释放*/putback_inactive_pages(lruvec, &page_list);__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);spin_unlock_irq(&zone->lru_lock);mem_cgroup_uncharge_list(&page_list);free_hot_cold_page_list(&page_list, true);/** If reclaim is isolating dirty pages under writeback, it implies* that the long-lived page allocation rate is exceeding the page* laundering rate. Either the global limits are not being effective* at throttling processes due to the page distribution throughout* zones or there is heavy usage of a slow backing device. The* only option is to throttle from reclaim context which is not ideal* as there is no guarantee the dirtying process is throttled in the* same way balance_dirty_pages() manages.** Once a zone is flagged ZONE_WRITEBACK, kswapd will count the number* of pages under pages flagged for immediate reclaim and stall if any* are encountered in the nr_immediate check below.*/if (nr_writeback && nr_writeback == nr_taken)set_bit(ZONE_WRITEBACK, &zone->flags);/** memcg will stall in page writeback so only consider forcibly* stalling for global reclaim*/if (global_reclaim(sc)) {/** Tag a zone as congested if all the dirty pages scanned were* backed by a congested BDI and wait_iff_congested will stall.*/if (nr_dirty && nr_dirty == nr_congested)set_bit(ZONE_CONGESTED, &zone->flags);/** If dirty pages are scanned that are not queued for IO, it* implies that flushers are not keeping up. In this case, flag* the zone ZONE_DIRTY and kswapd will start writing pages from* reclaim context.*/if (nr_unqueued_dirty == nr_taken)set_bit(ZONE_DIRTY, &zone->flags);/** If kswapd scans pages marked marked for immediate* reclaim and under writeback (nr_immediate), it implies* that pages are cycling through the LRU faster than* they are written so also forcibly stall.*/if (nr_immediate && current_may_throttle())congestion_wait(BLK_RW_ASYNC, HZ/10);}/** Stall direct reclaim for IO completions if underlying BDIs or zone* is congested. Allow kswapd to continue until it starts encountering* unqueued dirty pages or cycling through the LRU too quickly.*/if (!sc->hibernation_mode && !current_is_kswapd() &¤t_may_throttle())wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,zone_idx(zone),nr_scanned, nr_reclaimed,sc->priority,trace_shrink_flags(file));return nr_reclaimed;
}
shrink_page_list()函数实现:
对于dirty和writeback的页面会考虑到块设备回写的堵塞问题。可以省略不看这部分代码。
[kswapd()->balance_pgdat()->kswapd_shrink_zone()->shrink_zone()->shrink_lruvec()->shrink_active_list()->shrink_page_list]
/** shrink_page_list() returns the number of reclaimed pages*/
static unsigned long shrink_page_list(struct list_head *page_list,struct zone *zone,struct scan_control *sc,enum ttu_flags ttu_flags,unsigned long *ret_nr_dirty,unsigned long *ret_nr_unqueued_dirty,unsigned long *ret_nr_congested,unsigned long *ret_nr_writeback,unsigned long *ret_nr_immediate,bool force_reclaim)
{/*初始链表*/LIST_HEAD(ret_pages);LIST_HEAD(free_pages);int pgactivate = 0;unsigned long nr_unqueued_dirty = 0;unsigned long nr_dirty = 0;unsigned long nr_congested = 0;unsigned long nr_reclaimed = 0;unsigned long nr_writeback = 0;unsigned long nr_immediate = 0;cond_resched();/*while循环扫描page_list链表,这个链表的成员都是不活跃页面。*/while (!list_empty(page_list)) {struct address_space *mapping;struct page *page;int may_enter_fs;enum page_references references = PAGEREF_RECLAIM_CLEAN;bool dirty, writeback;cond_resched();page = lru_to_page(page_list);list_del(&page->lru);/*尝试获取page的PG_lock锁,如果获取不成功,那么page将继续保留在不活跃LRU链表中。*/if (!trylock_page(page))goto keep;VM_BUG_ON_PAGE(PageActive(page), page);VM_BUG_ON_PAGE(page_zone(page) != zone, page);sc->nr_scanned++;if (unlikely(!page_evictable(page)))goto cull_mlocked;/*判断是否允许回收映射的页面,sc->may_unmap为1,表示允许回收映射的页面。*/if (!sc->may_unmap && page_mapped(page))goto keep_locked;/* Double the slab pressure for mapped and swapcache pages */if (page_mapped(page) || PageSwapCache(page))sc->nr_scanned++;may_enter_fs = (sc->gfp_mask & __GFP_FS) ||(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));/** The number of dirty pages determines if a zone is marked* reclaim_congested which affects wait_iff_congested. kswapd* will stall and start writing pages if the tail of the LRU* is all dirty unqueued pages.*/page_check_dirty_writeback(page, &dirty, &writeback);if (dirty || writeback)nr_dirty++;if (dirty && !writeback)nr_unqueued_dirty++;/** Treat this page as congested if the underlying BDI is or if* pages are cycling through the LRU so quickly that the* pages marked for immediate reclaim are making it to the* end of the LRU a second time.*/mapping = page_mapping(page);if (((dirty || writeback) && mapping &&bdi_write_congested(inode_to_bdi(mapping->host))) ||(writeback && PageReclaim(page)))nr_congested++;/** If a page at the tail of the LRU is under writeback, there* are three cases to consider.** 1) If reclaim is encountering an excessive number of pages* under writeback and this page is both under writeback and* PageReclaim then it indicates that pages are being queued* for IO but are being recycled through the LRU before the* IO can complete. Waiting on the page itself risks an* indefinite stall if it is impossible to writeback the* page due to IO error or disconnected storage so instead* note that the LRU is being scanned too quickly and the* caller can stall after page list has been processed.** 2) Global reclaim encounters a page, memcg encounters a* page that is not marked for immediate reclaim or* the caller does not have __GFP_IO. In this case mark* the page for immediate reclaim and continue scanning.** __GFP_IO is checked because a loop driver thread might* enter reclaim, and deadlock if it waits on a page for* which it is needed to do the write (loop masks off* __GFP_IO|__GFP_FS for this reason); but more thought* would probably show more reasons.** Don't require __GFP_FS, since we're not going into the* FS, just waiting on its writeback completion. Worryingly,* ext4 gfs2 and xfs allocate pages with* grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing* may_enter_fs here is liable to OOM on them.** 3) memcg encounters a page that is not already marked* PageReclaim. memcg does not have any dirty pages* throttling so we could easily OOM just because too many* pages are in writeback and there is nothing else to* reclaim. Wait for the writeback to complete.*//*如果page有PG_PageWriteback标志位,说明page正在往磁盘里回写。这时最好让page继续保持在不活跃LRU链表中。在linux 3.11之前的内核,很多用户抱怨大文件复制或备份操作会导致系统宕机或应用被swap出去。有时内存短缺的情况下,突然有大量的内存要被回收,而有时应用程序或kswapd线程的CPU占用率长时间为100%,因此Linux 3.11以后的内核对此进行了优化,对于处于回写状态的页面会做统计,如果shrink_page_list()扫描一轮之后发现有大量处于回写状态的页面,则会设置zone->flag中的ZONE_WRITEBACK标志位。在下一轮扫描时,如果kswapd内核线程还遇到回写页面,那么就认为LRU扫描速度比页面IO回写速度快,这时会强制让kswapd睡眠等待100毫秒(congestion_wait(BLK_RW_ASYSC, HZ/10))*/if (PageWriteback(page)) {/* Case 1 above */if (current_is_kswapd() &&PageReclaim(page) &&test_bit(ZONE_WRITEBACK, &zone->flags)) {nr_immediate++;goto keep_locked;/* Case 2 above */} else if (global_reclaim(sc) ||!PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {/** This is slightly racy - end_page_writeback()* might have just cleared PageReclaim, then* setting PageReclaim here end up interpreted* as PageReadahead - but that does not matter* enough to care. What we do want is for this* page to have PageReclaim set next time memcg* reclaim reaches the tests above, so it will* then wait_on_page_writeback() to avoid OOM;* and it's also appropriate in global reclaim.*/SetPageReclaim(page);nr_writeback++;goto keep_locked;/* Case 3 above */} else {wait_on_page_writeback(page);}}/*page_check_references()函数计算该页面访问引用pte的用户数,并返回page_references的状态,简单归纳如下:(1) 如果有访问引用PTE@ 该页是匿名页面(PageSwapBacked(page)),则加入活跃链表。@ 最近第二次访问的page cache或者共享的page cache,则加入活跃链表。@ 可执行文件的page cache,则加入活跃链表。@ 除了上述三种情况,其余情况继续保留在不活跃链表。(2) 如果没有访问引用pte,则表示可以尝试回收*/if (!force_reclaim)references = page_check_references(page, sc);switch (references) {case PAGEREF_ACTIVATE:goto activate_locked;case PAGEREF_KEEP:goto keep_locked;case PAGEREF_RECLAIM:case PAGEREF_RECLAIM_CLEAN:; /* try to reclaim the page below */}/** Anonymous process memory has backing store?* Try to allocate it some swap space here.*//*!PageSwapCache(page) 说明page还没有分配交换空间(swap space),那么调用add_to_swap()为其分配交换空间,并且设置该页的标志位PG_swapcache*/if (PageAnon(page) && !PageSwapCache(page)) {if (!(sc->gfp_mask & __GFP_IO))goto keep_locked;if (!add_to_swap(page, page_list))goto activate_locked;may_enter_fs = 1;/* Adding to swap updated mapping */mapping = page_mapping(page);}/** The page is mapped into the page tables of one or more* processes. Try to unmap it here.*//*page有多个用户映射(page->_mapcount >= 0) 且mapping指向address_space,那么调用try_to_unmap()来解除这些用户映射的PTEs。函数返回SWAP_FAIL,说明解除pte失败,该页将迁移到活跃LRU中。返回SWAP_AGAIN,说明有的pte被漏掉了,保留在不活跃LRU链表中,下一次继续扫描。返回SWAP_SUCCESS,说明已经成功解除了所有PTEs映射了。*/if (page_mapped(page) && mapping) {switch (try_to_unmap(page, ttu_flags)) {case SWAP_FAIL:goto activate_locked;case SWAP_AGAIN:goto keep_locked;case SWAP_MLOCK:goto cull_mlocked;case SWAP_SUCCESS:; /* try to free the page below */}}/*处理page是dirty的情况。(1) 如果是文件映射页面,则设置page为PG_reclaim且继续保持在不活跃LRU中。在kswapd内核线程中进行一个页面的回写的做法不可取,早期的linux内核这样做事因为向存储设备中回写页面内容的速度比CPU慢很多个数量级。目前的做法是kswapd内核线程不会对零星的几个page cache页面进行回写,除非遇到之前有很多还没有开始回写的脏页面。当扫描完一轮后,发现有好多脏的page cache还没有来得及加入到回写子系统中(writeback subsystem),那么设置ZONE_DIRTY比特位,表示kswapd可以回写脏页面,否则一般情况下kswapd不回写脏的page cache。(2) 如果是匿名页面,那么调用pageout()函数进行写入交换分区。pageout()函数有4个返回值,PAGE_KEEP表示回写page失败,PAGE_ACTIVE表示page需要迁移回到活跃LRU链表中,PAGE_SUCCESS表示page已经成功写入存储设备,PAGE_CLEAN表示page已经干净,可以被释放了。*/if (PageDirty(page)) {/** Only kswapd can writeback filesystem pages to* avoid risk of stack overflow but only writeback* if many dirty pages have been encountered.*/if (page_is_file_cache(page) &&(!current_is_kswapd() ||!test_bit(ZONE_DIRTY, &zone->flags))) {/** Immediately reclaim when written back.* Similar in principal to deactivate_page()* except we already have the page isolated* and know it's dirty*/inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE);SetPageReclaim(page);goto keep_locked;}if (references == PAGEREF_RECLAIM_CLEAN)goto keep_locked;if (!may_enter_fs)goto keep_locked;if (!sc->may_writepage)goto keep_locked;/* Page is dirty, try to write it out here */switch (pageout(page, mapping, sc)) {case PAGE_KEEP:goto keep_locked;case PAGE_ACTIVATE:goto activate_locked;case PAGE_SUCCESS:if (PageWriteback(page))goto keep;if (PageDirty(page))goto keep;/** A synchronous write - probably a ramdisk. Go* ahead and try to reclaim the page.*/if (!trylock_page(page))goto keep;if (PageDirty(page) || PageWriteback(page))goto keep_locked;mapping = page_mapping(page);case PAGE_CLEAN:; /* try to free the page below */}}/** If the page has buffers, try to free the buffer mappings* associated with this page. If we succeed we try to free* the page as well.** We do this even if the page is PageDirty().* try_to_release_page() does not perform I/O, but it is* possible for a page to have PageDirty set, but it is actually* clean (all its buffers are clean). This happens if the* buffers were written out directly, with submit_bh(). ext3* will do this, as well as the blockdev mapping.* try_to_release_page() will discover that cleanness and will* drop the buffers and mark the page clean - it can be freed.** Rarely, pages can have buffers and no ->mapping. These are* the pages which were not successfully invalidated in* truncate_complete_page(). We try to drop those buffers here* and if that worked, and the page is no longer mapped into* process address space (page_count == 1) it can be freed.* Otherwise, leave the page on the LRU so it is swappable.*//*处理page被用于块设备的buffer_head缓存,try_to_release_page()释放buffer_head缓存*/if (page_has_private(page)) {if (!try_to_release_page(page, sc->gfp_mask))goto activate_locked;if (!mapping && page_count(page) == 1) {unlock_page(page);if (put_page_testzero(page))goto free_it;else {/** rare race with speculative reference.* the speculative reference will free* this page shortly, so we may* increment nr_reclaimed here (and* leave it off the LRU).*/nr_reclaimed++;continue;}}}/*__remove_mapping()尝试分离page->mapping。程序运行到这里,说明page已经完成了大部分回收的工作,首先会妥善处理page的_count引用计数,见page_freeze_refs()函数。对于匿名页面,即PG_SwapCache有置位的页面,__delete_from_swap_cache()处理swap cache相关问题。对于 page cache,调用__delete_from_cache()和mapping->a_ops->freepage()处理相关问题。*/if (!mapping || !__remove_mapping(mapping, page, true))goto keep_locked;/** At this point, we have no other references and there is* no way to pick any more up (removed from LRU, removed* from pagecache). Can use non-atomic bitops now (and* we obviously don't have to worry about waking up a process* waiting on the page lock, because there are no references.*//*清除page的PG_lock锁*/__clear_page_locked(page);/*free_it 标签统计已经回收好的页面数量nr_reclaimed,将这些要释放的页面加入free_pages链表中。*/
free_it:nr_reclaimed++;/** Is there need to periodically free_page_list? It would* appear not as the counts should be low*/list_add(&page->lru, &free_pages);continue;cull_mlocked:if (PageSwapCache(page))try_to_free_swap(page);unlock_page(page);putback_lru_page(page);continue;/*activate_locked标签处表示页面不能回收,需要重新返回活跃LRU链表*/
activate_locked:/* Not a candidate for swapping, so reclaim swap space. */if (PageSwapCache(page) && vm_swap_full())try_to_free_swap(page);VM_BUG_ON_PAGE(PageActive(page), page);SetPageActive(page);pgactivate++;
keep_locked:unlock_page(page);/*keep标签处表示页面继续保持在不活跃LRU链表中。*/
keep:list_add(&page->lru, &ret_pages);VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);}mem_cgroup_uncharge_list(&free_pages);free_hot_cold_page_list(&free_pages, true);list_splice(&ret_pages, page_list);count_vm_events(PGACTIVATE, pgactivate);*ret_nr_dirty += nr_dirty;*ret_nr_congested += nr_congested;*ret_nr_unqueued_dirty += nr_unqueued_dirty;*ret_nr_writeback += nr_writeback;*ret_nr_immediate += nr_immediate;return nr_reclaimed;
}
本文来自互联网用户投稿,文章观点仅代表作者本人,不代表本站立场,不承担相关法律责任。如若转载,请注明出处。 如若内容造成侵权/违法违规/事实不符,请点击【内容举报】进行投诉反馈!
