kernel: backport MGLRU to linux 6.0 for Tianling Shen

2025-04-16 04:13:31 +00:00 · 2022-10-16 02:16:00 +08:00 · 2022-10-16 02:16:00 +08:00 · 351d4bb63b
commit 351d4bb63b
parent 415a25a683
14 changed files with 7089 additions and 0 deletions
--- a/target/linux/generic/backport-6.0/100-mm-x86-arm64-add-arch_has_hw_pte_young.patch
+++ b/target/linux/generic/backport-6.0/100-mm-x86-arm64-add-arch_has_hw_pte_young.patch
@@ -0,0 +1,154 @@
 From e3264035bdac67898d685423ffb2f3a9c3a5964a Mon Sep 17 00:00:00 2001
 From: Yu Zhao <yuzhao@google.com>
 Date: Wed, 4 Aug 2021 01:31:34 -0600
 Subject: [PATCH 01/14] mm: x86, arm64: add arch_has_hw_pte_young()
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 Some architectures automatically set the accessed bit in PTEs, e.g.,
 x86 and arm64 v8.2. On architectures that do not have this capability,
 clearing the accessed bit in a PTE usually triggers a page fault
 following the TLB miss of this PTE (to emulate the accessed bit).
 Being aware of this capability can help make better decisions, e.g.,
 whether to spread the work out over a period of time to reduce bursty
 page faults when trying to clear the accessed bit in many PTEs.
 Note that theoretically this capability can be unreliable, e.g.,
 hotplugged CPUs might be different from builtin ones. Therefore it
 should not be used in architecture-independent code that involves
 correctness, e.g., to determine whether TLB flushes are required (in
 combination with the accessed bit).
 Signed-off-by: Yu Zhao <yuzhao@google.com>
 Reviewed-by: Barry Song <baohua@kernel.org>
 Acked-by: Brian Geffon <bgeffon@google.com>
 Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
 Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
 Acked-by: Steven Barrett <steven@liquorix.net>
 Acked-by: Suleiman Souhlal <suleiman@google.com>
 Acked-by: Will Deacon <will@kernel.org>
 Tested-by: Daniel Byrne <djbyrne@mtu.edu>
 Tested-by: Donald Carr <d@chaos-reins.com>
 Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
 Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
 Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
 Tested-by: Sofia Trinh <sofia.trinh@edi.works>
 Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
 Change-Id: Ib49b44fb56df3333a2ff1fcc496fb1980b976e7a
 ---
 arch/arm64/include/asm/pgtable.h | 15 ++-------------
 arch/x86/include/asm/pgtable.h   |  6 +++---
 include/linux/pgtable.h          | 13 +++++++++++++
 mm/memory.c                      | 14 +-------------
 4 files changed, 19 insertions(+), 29 deletions(-)
 diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
 index b5df82aa99e6..71a1af42f0e8 100644
 --- a/arch/arm64/include/asm/pgtable.h
 +++ b/arch/arm64/include/asm/pgtable.h
@@ -1082,24 +1082,13 @@ static inline void update_mmu_cache(struct vm_area_struct *vma,
  * page after fork() + CoW for pfn mappings. We don't always have a
  * hardware-managed access flag on arm64.
  */
 -static inline bool arch_faults_on_old_pte(void)
 -{
 -	/* The register read below requires a stable CPU to make any sense */
 -	cant_migrate();
 -
 -	return !cpu_has_hw_af();
 -}
 -#define arch_faults_on_old_pte		arch_faults_on_old_pte
 +#define arch_has_hw_pte_young		cpu_has_hw_af
 /*
  * Experimentally, it's cheap to set the access flag in hardware and we
  * benefit from prefaulting mappings as 'old' to start with.
  */
 -static inline bool arch_wants_old_prefaulted_pte(void)
 -{
 -	return !arch_faults_on_old_pte();
 -}
 -#define arch_wants_old_prefaulted_pte	arch_wants_old_prefaulted_pte
 +#define arch_wants_old_prefaulted_pte	cpu_has_hw_af
 static inline bool pud_sect_supported(void)
 {
 diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
 index 44e2d6f1dbaa..dc5f7d8ef68a 100644
 --- a/arch/x86/include/asm/pgtable.h
 +++ b/arch/x86/include/asm/pgtable.h
@@ -1431,10 +1431,10 @@ static inline bool arch_has_pfn_modify_check(void)
 	return boot_cpu_has_bug(X86_BUG_L1TF);
 }
 -#define arch_faults_on_old_pte arch_faults_on_old_pte
 -static inline bool arch_faults_on_old_pte(void)
 +#define arch_has_hw_pte_young arch_has_hw_pte_young
 +static inline bool arch_has_hw_pte_young(void)
 {
 -	return false;
 +	return true;
 }
 #ifdef CONFIG_PAGE_TABLE_CHECK
 diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
 index 014ee8f0fbaa..95f408df4695 100644
 --- a/include/linux/pgtable.h
 +++ b/include/linux/pgtable.h
@@ -260,6 +260,19 @@ static inline int pmdp_clear_flush_young(struct vm_area_struct *vma,
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 #endif
 +#ifndef arch_has_hw_pte_young
 +/*
 + * Return whether the accessed bit is supported on the local CPU.
 + *
 + * This stub assumes accessing through an old PTE triggers a page fault.
 + * Architectures that automatically set the accessed bit should override it.
 + */
 +static inline bool arch_has_hw_pte_young(void)
 +{
 +	return false;
 +}
 +#endif
 +
 #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR
 static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
 				       unsigned long address,
 diff --git a/mm/memory.c b/mm/memory.c
 index a78814413ac0..68294ce1cb06 100644
 --- a/mm/memory.c
 +++ b/mm/memory.c
@@ -125,18 +125,6 @@ int randomize_va_space __read_mostly =
 					2;
 #endif
 -#ifndef arch_faults_on_old_pte
 -static inline bool arch_faults_on_old_pte(void)
 -{
 -	/*
 -	 * Those arches which don't have hw access flag feature need to
 -	 * implement their own helper. By default, "true" means pagefault
 -	 * will be hit on old pte.
 -	 */
 -	return true;
 -}
 -#endif
 -
 #ifndef arch_wants_old_prefaulted_pte
 static inline bool arch_wants_old_prefaulted_pte(void)
 {
@@ -2870,7 +2858,7 @@ static inline bool __wp_page_copy_user(struct page *dst, struct page *src,
 	 * On architectures with software "accessed" bits, we would
 	 * take a double page fault, so mark it accessed here.
 	 */
 -	if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) {
 +	if (!arch_has_hw_pte_young() && !pte_young(vmf->orig_pte)) {
 		pte_t entry;
 		vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
 -- 
 2.17.1
--- a/target/linux/generic/backport-6.0/101-mm-x86-add-CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG.patch
+++ b/target/linux/generic/backport-6.0/101-mm-x86-add-CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG.patch
@@ -0,0 +1,145 @@
 From 0c0016e6f53b52166fe4da61c81fa6b27f4650cd Mon Sep 17 00:00:00 2001
 From: Yu Zhao <yuzhao@google.com>
 Date: Sat, 26 Sep 2020 21:17:18 -0600
 Subject: [PATCH 02/14] mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 Some architectures support the accessed bit in non-leaf PMD entries,
 e.g., x86 sets the accessed bit in a non-leaf PMD entry when using it
 as part of linear address translation [1]. Page table walkers that
 clear the accessed bit may use this capability to reduce their search
 space.
 Note that:
 1. Although an inline function is preferable, this capability is added
   as a configuration option for consistency with the existing macros.
 2. Due to the little interest in other varieties, this capability was
   only tested on Intel and AMD CPUs.
 Thanks to the following developers for their efforts [2][3].
  Randy Dunlap <rdunlap@infradead.org>
  Stephen Rothwell <sfr@canb.auug.org.au>
 [1]: Intel 64 and IA-32 Architectures Software Developer's Manual
     Volume 3 (June 2021), section 4.8
 [2] https://lore.kernel.org/r/bfdcc7c8-922f-61a9-aa15-7e7250f04af7@infradead.org/
 [3] https://lore.kernel.org/r/20220413151513.5a0d7a7e@canb.auug.org.au/
 Signed-off-by: Yu Zhao <yuzhao@google.com>
 Reviewed-by: Barry Song <baohua@kernel.org>
 Acked-by: Brian Geffon <bgeffon@google.com>
 Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
 Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
 Acked-by: Steven Barrett <steven@liquorix.net>
 Acked-by: Suleiman Souhlal <suleiman@google.com>
 Tested-by: Daniel Byrne <djbyrne@mtu.edu>
 Tested-by: Donald Carr <d@chaos-reins.com>
 Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
 Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
 Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
 Tested-by: Sofia Trinh <sofia.trinh@edi.works>
 Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
 Change-Id: I1a17be3ae926f721f7b17ea1539e5c39e8c4f9a8
 ---
 arch/Kconfig                   | 8 ++++++++
 arch/x86/Kconfig               | 1 +
 arch/x86/include/asm/pgtable.h | 3 ++-
 arch/x86/mm/pgtable.c          | 5 ++++-
 include/linux/pgtable.h        | 4 ++--
 5 files changed, 17 insertions(+), 4 deletions(-)
 diff --git a/arch/Kconfig b/arch/Kconfig
 index 8b311e400ec1..bf19a84fffa2 100644
 --- a/arch/Kconfig
 +++ b/arch/Kconfig
@@ -1418,6 +1418,14 @@ config DYNAMIC_SIGFRAME
 config HAVE_ARCH_NODE_DEV_GROUP
 	bool
 +config ARCH_HAS_NONLEAF_PMD_YOUNG
 +	bool
 +	help
 +	  Architectures that select this option are capable of setting the
 +	  accessed bit in non-leaf PMD entries when using them as part of linear
 +	  address translations. Page table walkers that clear the accessed bit
 +	  may use this capability to reduce their search space.
 +
 source "kernel/gcov/Kconfig"
 source "scripts/gcc-plugins/Kconfig"
 diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
 index f9920f1341c8..674d694a665e 100644
 --- a/arch/x86/Kconfig
 +++ b/arch/x86/Kconfig
@@ -85,6 +85,7 @@ config X86
 	select ARCH_HAS_PMEM_API		if X86_64
 	select ARCH_HAS_PTE_DEVMAP		if X86_64
 	select ARCH_HAS_PTE_SPECIAL
 +	select ARCH_HAS_NONLEAF_PMD_YOUNG	if PGTABLE_LEVELS > 2
 	select ARCH_HAS_UACCESS_FLUSHCACHE	if X86_64
 	select ARCH_HAS_COPY_MC			if X86_64
 	select ARCH_HAS_SET_MEMORY
 diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
 index dc5f7d8ef68a..5059799bebe3 100644
 --- a/arch/x86/include/asm/pgtable.h
 +++ b/arch/x86/include/asm/pgtable.h
@@ -815,7 +815,8 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
 static inline int pmd_bad(pmd_t pmd)
 {
 -	return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE;
 +	return (pmd_flags(pmd) & ~(_PAGE_USER | _PAGE_ACCESSED)) !=
 +	       (_KERNPG_TABLE & ~_PAGE_ACCESSED);
 }
 static inline unsigned long pages_to_mb(unsigned long npg)
 diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
 index a932d7712d85..8525f2876fb4 100644
 --- a/arch/x86/mm/pgtable.c
 +++ b/arch/x86/mm/pgtable.c
@@ -550,7 +550,7 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma,
 	return ret;
 }
 -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
 int pmdp_test_and_clear_young(struct vm_area_struct *vma,
 			      unsigned long addr, pmd_t *pmdp)
 {
@@ -562,6 +562,9 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma,
 	return ret;
 }
 +#endif
 +
 +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 int pudp_test_and_clear_young(struct vm_area_struct *vma,
 			      unsigned long addr, pud_t *pudp)
 {
 diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
 index 95f408df4695..d9095251bffd 100644
 --- a/include/linux/pgtable.h
 +++ b/include/linux/pgtable.h
@@ -213,7 +213,7 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
 #endif
 #ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
 -#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
 static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
 					    unsigned long address,
 					    pmd_t *pmdp)
@@ -234,7 +234,7 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
 	BUILD_BUG();
 	return 0;
 }
 -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 +#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */
 #endif
 #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
 -- 
 2.17.1
--- a/target/linux/generic/backport-6.0/102-mm-vmscan.c-refactor-shrink_node.patch
+++ b/target/linux/generic/backport-6.0/102-mm-vmscan.c-refactor-shrink_node.patch
@@ -0,0 +1,259 @@
 From d8e0edcddc441574410a047ede56f79c849a6d37 Mon Sep 17 00:00:00 2001
 From: Yu Zhao <yuzhao@google.com>
 Date: Sun, 27 Sep 2020 20:49:08 -0600
 Subject: [PATCH 03/14] mm/vmscan.c: refactor shrink_node()
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 This patch refactors shrink_node() to improve readability for the
 upcoming changes to mm/vmscan.c.
 Signed-off-by: Yu Zhao <yuzhao@google.com>
 Reviewed-by: Barry Song <baohua@kernel.org>
 Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
 Acked-by: Brian Geffon <bgeffon@google.com>
 Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
 Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
 Acked-by: Steven Barrett <steven@liquorix.net>
 Acked-by: Suleiman Souhlal <suleiman@google.com>
 Tested-by: Daniel Byrne <djbyrne@mtu.edu>
 Tested-by: Donald Carr <d@chaos-reins.com>
 Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
 Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
 Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
 Tested-by: Sofia Trinh <sofia.trinh@edi.works>
 Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
 Change-Id: Iae734b5b4030205b7db6e8c841f747b6f6ae1a04
 ---
 mm/vmscan.c | 198 +++++++++++++++++++++++++++-------------------------
 1 file changed, 104 insertions(+), 94 deletions(-)
 diff --git a/mm/vmscan.c b/mm/vmscan.c
 index 382dbe97329f..4e4331367db9 100644
 --- a/mm/vmscan.c
 +++ b/mm/vmscan.c
@@ -2728,6 +2728,109 @@ enum scan_balance {
 	SCAN_FILE,
 };
 +static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc)
 +{
 +	unsigned long file;
 +	struct lruvec *target_lruvec;
 +
 +	target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
 +
 +	/*
 +	 * Flush the memory cgroup stats, so that we read accurate per-memcg
 +	 * lruvec stats for heuristics.
 +	 */
 +	mem_cgroup_flush_stats();
 +
 +	/*
 +	 * Determine the scan balance between anon and file LRUs.
 +	 */
 +	spin_lock_irq(&target_lruvec->lru_lock);
 +	sc->anon_cost = target_lruvec->anon_cost;
 +	sc->file_cost = target_lruvec->file_cost;
 +	spin_unlock_irq(&target_lruvec->lru_lock);
 +
 +	/*
 +	 * Target desirable inactive:active list ratios for the anon
 +	 * and file LRU lists.
 +	 */
 +	if (!sc->force_deactivate) {
 +		unsigned long refaults;
 +
 +		refaults = lruvec_page_state(target_lruvec,
 +				WORKINGSET_ACTIVATE_ANON);
 +		if (refaults != target_lruvec->refaults[0] ||
 +			inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
 +			sc->may_deactivate |= DEACTIVATE_ANON;
 +		else
 +			sc->may_deactivate &= ~DEACTIVATE_ANON;
 +
 +		/*
 +		 * When refaults are being observed, it means a new
 +		 * workingset is being established. Deactivate to get
 +		 * rid of any stale active pages quickly.
 +		 */
 +		refaults = lruvec_page_state(target_lruvec,
 +				WORKINGSET_ACTIVATE_FILE);
 +		if (refaults != target_lruvec->refaults[1] ||
 +		    inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
 +			sc->may_deactivate |= DEACTIVATE_FILE;
 +		else
 +			sc->may_deactivate &= ~DEACTIVATE_FILE;
 +	} else
 +		sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
 +
 +	/*
 +	 * If we have plenty of inactive file pages that aren't
 +	 * thrashing, try to reclaim those first before touching
 +	 * anonymous pages.
 +	 */
 +	file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
 +	if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
 +		sc->cache_trim_mode = 1;
 +	else
 +		sc->cache_trim_mode = 0;
 +
 +	/*
 +	 * Prevent the reclaimer from falling into the cache trap: as
 +	 * cache pages start out inactive, every cache fault will tip
 +	 * the scan balance towards the file LRU.  And as the file LRU
 +	 * shrinks, so does the window for rotation from references.
 +	 * This means we have a runaway feedback loop where a tiny
 +	 * thrashing file LRU becomes infinitely more attractive than
 +	 * anon pages.  Try to detect this based on file LRU size.
 +	 */
 +	if (!cgroup_reclaim(sc)) {
 +		unsigned long total_high_wmark = 0;
 +		unsigned long free, anon;
 +		int z;
 +
 +		free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
 +		file = node_page_state(pgdat, NR_ACTIVE_FILE) +
 +			   node_page_state(pgdat, NR_INACTIVE_FILE);
 +
 +		for (z = 0; z < MAX_NR_ZONES; z++) {
 +			struct zone *zone = &pgdat->node_zones[z];
 +
 +			if (!managed_zone(zone))
 +				continue;
 +
 +			total_high_wmark += high_wmark_pages(zone);
 +		}
 +
 +		/*
 +		 * Consider anon: if that's low too, this isn't a
 +		 * runaway file reclaim problem, but rather just
 +		 * extreme pressure. Reclaim as per usual then.
 +		 */
 +		anon = node_page_state(pgdat, NR_INACTIVE_ANON);
 +
 +		sc->file_is_tiny =
 +			file + free <= total_high_wmark &&
 +			!(sc->may_deactivate & DEACTIVATE_ANON) &&
 +			anon >> sc->priority;
 +	}
 +}
 +
 /*
  * Determine how aggressively the anon and file LRU lists should be
  * scanned.
@@ -3197,109 +3300,16 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 	unsigned long nr_reclaimed, nr_scanned;
 	struct lruvec *target_lruvec;
 	bool reclaimable = false;
 -	unsigned long file;
 	target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
 again:
 -	/*
 -	 * Flush the memory cgroup stats, so that we read accurate per-memcg
 -	 * lruvec stats for heuristics.
 -	 */
 -	mem_cgroup_flush_stats();
 -
 	memset(&sc->nr, 0, sizeof(sc->nr));
 	nr_reclaimed = sc->nr_reclaimed;
 	nr_scanned = sc->nr_scanned;
 -	/*
 -	 * Determine the scan balance between anon and file LRUs.
 -	 */
 -	spin_lock_irq(&target_lruvec->lru_lock);
 -	sc->anon_cost = target_lruvec->anon_cost;
 -	sc->file_cost = target_lruvec->file_cost;
 -	spin_unlock_irq(&target_lruvec->lru_lock);
 -
 -	/*
 -	 * Target desirable inactive:active list ratios for the anon
 -	 * and file LRU lists.
 -	 */
 -	if (!sc->force_deactivate) {
 -		unsigned long refaults;
 -
 -		refaults = lruvec_page_state(target_lruvec,
 -				WORKINGSET_ACTIVATE_ANON);
 -		if (refaults != target_lruvec->refaults[0] ||
 -			inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
 -			sc->may_deactivate |= DEACTIVATE_ANON;
 -		else
 -			sc->may_deactivate &= ~DEACTIVATE_ANON;
 -
 -		/*
 -		 * When refaults are being observed, it means a new
 -		 * workingset is being established. Deactivate to get
 -		 * rid of any stale active pages quickly.
 -		 */
 -		refaults = lruvec_page_state(target_lruvec,
 -				WORKINGSET_ACTIVATE_FILE);
 -		if (refaults != target_lruvec->refaults[1] ||
 -		    inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
 -			sc->may_deactivate |= DEACTIVATE_FILE;
 -		else
 -			sc->may_deactivate &= ~DEACTIVATE_FILE;
 -	} else
 -		sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
 -
 -	/*
 -	 * If we have plenty of inactive file pages that aren't
 -	 * thrashing, try to reclaim those first before touching
 -	 * anonymous pages.
 -	 */
 -	file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
 -	if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
 -		sc->cache_trim_mode = 1;
 -	else
 -		sc->cache_trim_mode = 0;
 -
 -	/*
 -	 * Prevent the reclaimer from falling into the cache trap: as
 -	 * cache pages start out inactive, every cache fault will tip
 -	 * the scan balance towards the file LRU.  And as the file LRU
 -	 * shrinks, so does the window for rotation from references.
 -	 * This means we have a runaway feedback loop where a tiny
 -	 * thrashing file LRU becomes infinitely more attractive than
 -	 * anon pages.  Try to detect this based on file LRU size.
 -	 */
 -	if (!cgroup_reclaim(sc)) {
 -		unsigned long total_high_wmark = 0;
 -		unsigned long free, anon;
 -		int z;
 -
 -		free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
 -		file = node_page_state(pgdat, NR_ACTIVE_FILE) +
 -			   node_page_state(pgdat, NR_INACTIVE_FILE);
 -
 -		for (z = 0; z < MAX_NR_ZONES; z++) {
 -			struct zone *zone = &pgdat->node_zones[z];
 -			if (!managed_zone(zone))
 -				continue;
 -
 -			total_high_wmark += high_wmark_pages(zone);
 -		}
 -
 -		/*
 -		 * Consider anon: if that's low too, this isn't a
 -		 * runaway file reclaim problem, but rather just
 -		 * extreme pressure. Reclaim as per usual then.
 -		 */
 -		anon = node_page_state(pgdat, NR_INACTIVE_ANON);
 -
 -		sc->file_is_tiny =
 -			file + free <= total_high_wmark &&
 -			!(sc->may_deactivate & DEACTIVATE_ANON) &&
 -			anon >> sc->priority;
 -	}
 +	prepare_scan_count(pgdat, sc);
 	shrink_node_memcgs(pgdat, sc);
 -- 
 2.17.1
--- a/target/linux/generic/backport-6.0/103-Revert-include-linux-mm_inline.h-fold-__update_lru_s.patch
+++ b/target/linux/generic/backport-6.0/103-Revert-include-linux-mm_inline.h-fold-__update_lru_s.patch
@@ -0,0 +1,64 @@
 From bc14d2c7c6d0fb8c79ad0fc5eab488b977cbcccf Mon Sep 17 00:00:00 2001
 From: Yu Zhao <yuzhao@google.com>
 Date: Sun, 6 Mar 2022 20:22:40 -0700
 Subject: [PATCH 04/14] Revert "include/linux/mm_inline.h: fold
 __update_lru_size() into its sole caller"
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 This patch undoes the following refactor:
 commit 289ccba18af4 ("include/linux/mm_inline.h: fold __update_lru_size() into its sole caller")
 The upcoming changes to include/linux/mm_inline.h will reuse
 __update_lru_size().
 Signed-off-by: Yu Zhao <yuzhao@google.com>
 Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
 Acked-by: Brian Geffon <bgeffon@google.com>
 Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
 Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
 Acked-by: Steven Barrett <steven@liquorix.net>
 Acked-by: Suleiman Souhlal <suleiman@google.com>
 Tested-by: Daniel Byrne <djbyrne@mtu.edu>
 Tested-by: Donald Carr <d@chaos-reins.com>
 Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
 Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
 Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
 Tested-by: Sofia Trinh <sofia.trinh@edi.works>
 Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
 Change-Id: I6155c407d50199a43b179c7f45904d4b7c052118
 ---
 include/linux/mm_inline.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)
 diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
 index 7b25b53c474a..fb8aadb81cd6 100644
 --- a/include/linux/mm_inline.h
 +++ b/include/linux/mm_inline.h
@@ -34,7 +34,7 @@ static inline int page_is_file_lru(struct page *page)
 	return folio_is_file_lru(page_folio(page));
 }
 -static __always_inline void update_lru_size(struct lruvec *lruvec,
 +static __always_inline void __update_lru_size(struct lruvec *lruvec,
 				enum lru_list lru, enum zone_type zid,
 				long nr_pages)
 {
@@ -43,6 +43,13 @@ static __always_inline void update_lru_size(struct lruvec *lruvec,
 	__mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages);
 	__mod_zone_page_state(&pgdat->node_zones[zid],
 				NR_ZONE_LRU_BASE + lru, nr_pages);
 +}
 +
 +static __always_inline void update_lru_size(struct lruvec *lruvec,
 +				enum lru_list lru, enum zone_type zid,
 +				long nr_pages)
 +{
 +	__update_lru_size(lruvec, lru, zid, nr_pages);
 #ifdef CONFIG_MEMCG
 	mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages);
 #endif
 -- 
 2.17.1
--- a/target/linux/generic/backport-6.0/104-mm-multi-gen-LRU-groundwork.patch
+++ b/target/linux/generic/backport-6.0/104-mm-multi-gen-LRU-groundwork.patch
@@ -0,0 +1,810 @@
 From 8c6beb4548c216da9dae5e1a7612a108396e3f9e Mon Sep 17 00:00:00 2001
 From: Yu Zhao <yuzhao@google.com>
 Date: Mon, 25 Jan 2021 21:12:33 -0700
 Subject: [PATCH 05/14] mm: multi-gen LRU: groundwork
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 Evictable pages are divided into multiple generations for each lruvec.
 The youngest generation number is stored in lrugen->max_seq for both
 anon and file types as they are aged on an equal footing. The oldest
 generation numbers are stored in lrugen->min_seq[] separately for anon
 and file types as clean file pages can be evicted regardless of swap
 constraints. These three variables are monotonically increasing.
 Generation numbers are truncated into order_base_2(MAX_NR_GENS+1) bits
 in order to fit into the gen counter in folio->flags. Each truncated
 generation number is an index to lrugen->lists[]. The sliding window
 technique is used to track at least MIN_NR_GENS and at most
 MAX_NR_GENS generations. The gen counter stores a value within [1,
 MAX_NR_GENS] while a page is on one of lrugen->lists[]. Otherwise it
 stores 0.
 There are two conceptually independent procedures: "the aging", which
 produces young generations, and "the eviction", which consumes old
 generations. They form a closed-loop system, i.e., "the page reclaim".
 Both procedures can be invoked from userspace for the purposes of
 working set estimation and proactive reclaim. These techniques are
 commonly used to optimize job scheduling (bin packing) in data
 centers [1][2].
 To avoid confusion, the terms "hot" and "cold" will be applied to the
 multi-gen LRU, as a new convention; the terms "active" and "inactive"
 will be applied to the active/inactive LRU, as usual.
 The protection of hot pages and the selection of cold pages are based
 on page access channels and patterns. There are two access channels:
 one through page tables and the other through file descriptors. The
 protection of the former channel is by design stronger because:
 1. The uncertainty in determining the access patterns of the former
   channel is higher due to the approximation of the accessed bit.
 2. The cost of evicting the former channel is higher due to the TLB
   flushes required and the likelihood of encountering the dirty bit.
 3. The penalty of underprotecting the former channel is higher because
   applications usually do not prepare themselves for major page
   faults like they do for blocked I/O. E.g., GUI applications
   commonly use dedicated I/O threads to avoid blocking rendering
   threads.
 There are also two access patterns: one with temporal locality and the
 other without. For the reasons listed above, the former channel is
 assumed to follow the former pattern unless VM_SEQ_READ or
 VM_RAND_READ is present; the latter channel is assumed to follow the
 latter pattern unless outlying refaults have been observed [3][4].
 The next patch will address the "outlying refaults". Three macros
 used by later patches, i.e., LRU_REFS_WIDTH, LRU_REFS_PGOFF and
 LRU_REFS_MASK, are added in this patch to keep the later diffs small.
 A page is added to the youngest generation on faulting. The aging
 needs to check the accessed bit at least twice before handing this
 page over to the eviction. The first check takes care of the accessed
 bit set on the initial fault; the second check makes sure this page
 has not been used since then. This protocol, AKA second chance,
 requires a minimum of two generations, hence MIN_NR_GENS.
 [1] https://dl.acm.org/doi/10.1145/3297858.3304053
 [2] https://dl.acm.org/doi/10.1145/3503222.3507731
 [3] https://lwn.net/Articles/495543/
 [4] https://lwn.net/Articles/815342/
 Signed-off-by: Yu Zhao <yuzhao@google.com>
 Acked-by: Brian Geffon <bgeffon@google.com>
 Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
 Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
 Acked-by: Steven Barrett <steven@liquorix.net>
 Acked-by: Suleiman Souhlal <suleiman@google.com>
 Tested-by: Daniel Byrne <djbyrne@mtu.edu>
 Tested-by: Donald Carr <d@chaos-reins.com>
 Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
 Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
 Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
 Tested-by: Sofia Trinh <sofia.trinh@edi.works>
 Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
 Change-Id: I71de7cd15b8dfa6f9fdd838023474693c4fee0a7
 ---
 fs/fuse/dev.c                     |   3 +-
 include/linux/mm_inline.h         | 175 ++++++++++++++++++++++++++++++
 include/linux/mmzone.h            | 102 +++++++++++++++++
 include/linux/page-flags-layout.h |  13 ++-
 include/linux/page-flags.h        |   4 +-
 include/linux/sched.h             |   4 +
 kernel/bounds.c                   |   5 +
 mm/Kconfig                        |   8 ++
 mm/huge_memory.c                  |   3 +-
 mm/memcontrol.c                   |   2 +
 mm/memory.c                       |  25 +++++
 mm/mm_init.c                      |   6 +-
 mm/mmzone.c                       |   2 +
 mm/swap.c                         |  11 +-
 mm/vmscan.c                       |  75 +++++++++++++
 15 files changed, 424 insertions(+), 14 deletions(-)
 diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
 index 51897427a534..b4a6e0a1b945 100644
 --- a/fs/fuse/dev.c
 +++ b/fs/fuse/dev.c
@@ -776,7 +776,8 @@ static int fuse_check_page(struct page *page)
 	       1 << PG_active |
 	       1 << PG_workingset |
 	       1 << PG_reclaim |
 -	       1 << PG_waiters))) {
 +	       1 << PG_waiters |
 +	       LRU_GEN_MASK | LRU_REFS_MASK))) {
 		dump_page(page, "fuse: trying to steal weird page");
 		return 1;
 	}
 diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
 index fb8aadb81cd6..2ff703900fd0 100644
 --- a/include/linux/mm_inline.h
 +++ b/include/linux/mm_inline.h
@@ -40,6 +40,9 @@ static __always_inline void __update_lru_size(struct lruvec *lruvec,
 {
 	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
 +	lockdep_assert_held(&lruvec->lru_lock);
 +	WARN_ON_ONCE(nr_pages != (int)nr_pages);
 +
 	__mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages);
 	__mod_zone_page_state(&pgdat->node_zones[zid],
 				NR_ZONE_LRU_BASE + lru, nr_pages);
@@ -101,11 +104,177 @@ static __always_inline enum lru_list folio_lru_list(struct folio *folio)
 	return lru;
 }
 +#ifdef CONFIG_LRU_GEN
 +
 +static inline bool lru_gen_enabled(void)
 +{
 +	return true;
 +}
 +
 +static inline bool lru_gen_in_fault(void)
 +{
 +	return current->in_lru_fault;
 +}
 +
 +static inline int lru_gen_from_seq(unsigned long seq)
 +{
 +	return seq % MAX_NR_GENS;
 +}
 +
 +static inline int folio_lru_gen(struct folio *folio)
 +{
 +	unsigned long flags = READ_ONCE(folio->flags);
 +
 +	return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
 +}
 +
 +static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
 +{
 +	unsigned long max_seq = lruvec->lrugen.max_seq;
 +
 +	VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
 +
 +	/* see the comment on MIN_NR_GENS */
 +	return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1);
 +}
 +
 +static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *folio,
 +				       int old_gen, int new_gen)
 +{
 +	int type = folio_is_file_lru(folio);
 +	int zone = folio_zonenum(folio);
 +	int delta = folio_nr_pages(folio);
 +	enum lru_list lru = type * LRU_INACTIVE_FILE;
 +	struct lru_gen_struct *lrugen = &lruvec->lrugen;
 +
 +	VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS);
 +	VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS);
 +	VM_WARN_ON_ONCE(old_gen == -1 && new_gen == -1);
 +
 +	if (old_gen >= 0)
 +		WRITE_ONCE(lrugen->nr_pages[old_gen][type][zone],
 +			   lrugen->nr_pages[old_gen][type][zone] - delta);
 +	if (new_gen >= 0)
 +		WRITE_ONCE(lrugen->nr_pages[new_gen][type][zone],
 +			   lrugen->nr_pages[new_gen][type][zone] + delta);
 +
 +	/* addition */
 +	if (old_gen < 0) {
 +		if (lru_gen_is_active(lruvec, new_gen))
 +			lru += LRU_ACTIVE;
 +		__update_lru_size(lruvec, lru, zone, delta);
 +		return;
 +	}
 +
 +	/* deletion */
 +	if (new_gen < 0) {
 +		if (lru_gen_is_active(lruvec, old_gen))
 +			lru += LRU_ACTIVE;
 +		__update_lru_size(lruvec, lru, zone, -delta);
 +		return;
 +	}
 +}
 +
 +static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
 +{
 +	unsigned long seq;
 +	unsigned long flags;
 +	int gen = folio_lru_gen(folio);
 +	int type = folio_is_file_lru(folio);
 +	int zone = folio_zonenum(folio);
 +	struct lru_gen_struct *lrugen = &lruvec->lrugen;
 +
 +	VM_WARN_ON_ONCE_FOLIO(gen != -1, folio);
 +
 +	if (folio_test_unevictable(folio))
 +		return false;
 +	/*
 +	 * There are three common cases for this page:
 +	 * 1. If it's hot, e.g., freshly faulted in or previously hot and
 +	 *    migrated, add it to the youngest generation.
 +	 * 2. If it's cold but can't be evicted immediately, i.e., an anon page
 +	 *    not in swapcache or a dirty page pending writeback, add it to the
 +	 *    second oldest generation.
 +	 * 3. Everything else (clean, cold) is added to the oldest generation.
 +	 */
 +	if (folio_test_active(folio))
 +		seq = lrugen->max_seq;
 +	else if ((type == LRU_GEN_ANON && !folio_test_swapcache(folio)) ||
 +		 (folio_test_reclaim(folio) &&
 +		  (folio_test_dirty(folio) || folio_test_writeback(folio))))
 +		seq = lrugen->min_seq[type] + 1;
 +	else
 +		seq = lrugen->min_seq[type];
 +
 +	gen = lru_gen_from_seq(seq);
 +	flags = (gen + 1UL) << LRU_GEN_PGOFF;
 +	/* see the comment on MIN_NR_GENS about PG_active */
 +	set_mask_bits(&folio->flags, LRU_GEN_MASK | BIT(PG_active), flags);
 +
 +	lru_gen_update_size(lruvec, folio, -1, gen);
 +	/* for folio_rotate_reclaimable() */
 +	if (reclaiming)
 +		list_add_tail(&folio->lru, &lrugen->lists[gen][type][zone]);
 +	else
 +		list_add(&folio->lru, &lrugen->lists[gen][type][zone]);
 +
 +	return true;
 +}
 +
 +static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
 +{
 +	unsigned long flags;
 +	int gen = folio_lru_gen(folio);
 +
 +	if (gen < 0)
 +		return false;
 +
 +	VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
 +	VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
 +
 +	/* for folio_migrate_flags() */
 +	flags = !reclaiming && lru_gen_is_active(lruvec, gen) ? BIT(PG_active) : 0;
 +	flags = set_mask_bits(&folio->flags, LRU_GEN_MASK, flags);
 +	gen = ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
 +
 +	lru_gen_update_size(lruvec, folio, gen, -1);
 +	list_del(&folio->lru);
 +
 +	return true;
 +}
 +
 +#else /* !CONFIG_LRU_GEN */
 +
 +static inline bool lru_gen_enabled(void)
 +{
 +	return false;
 +}
 +
 +static inline bool lru_gen_in_fault(void)
 +{
 +	return false;
 +}
 +
 +static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
 +{
 +	return false;
 +}
 +
 +static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
 +{
 +	return false;
 +}
 +
 +#endif /* CONFIG_LRU_GEN */
 +
 static __always_inline
 void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio)
 {
 	enum lru_list lru = folio_lru_list(folio);
 +	if (lru_gen_add_folio(lruvec, folio, false))
 +		return;
 +
 	update_lru_size(lruvec, lru, folio_zonenum(folio),
 			folio_nr_pages(folio));
 	if (lru != LRU_UNEVICTABLE)
@@ -123,6 +292,9 @@ void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio)
 {
 	enum lru_list lru = folio_lru_list(folio);
 +	if (lru_gen_add_folio(lruvec, folio, true))
 +		return;
 +
 	update_lru_size(lruvec, lru, folio_zonenum(folio),
 			folio_nr_pages(folio));
 	/* This is not expected to be used on LRU_UNEVICTABLE */
@@ -140,6 +312,9 @@ void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio)
 {
 	enum lru_list lru = folio_lru_list(folio);
 +	if (lru_gen_del_folio(lruvec, folio, false))
 +		return;
 +
 	if (lru != LRU_UNEVICTABLE)
 		list_del(&folio->lru);
 	update_lru_size(lruvec, lru, folio_zonenum(folio),
 diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
 index e24b40c52468..1abb6d38ed86 100644
 --- a/include/linux/mmzone.h
 +++ b/include/linux/mmzone.h
@@ -314,6 +314,102 @@ enum lruvec_flags {
 					 */
 };
 +#endif /* !__GENERATING_BOUNDS_H */
 +
 +/*
 + * Evictable pages are divided into multiple generations. The youngest and the
 + * oldest generation numbers, max_seq and min_seq, are monotonically increasing.
 + * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An
 + * offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the
 + * corresponding generation. The gen counter in folio->flags stores gen+1 while
 + * a page is on one of lrugen->lists[]. Otherwise it stores 0.
 + *
 + * A page is added to the youngest generation on faulting. The aging needs to
 + * check the accessed bit at least twice before handing this page over to the
 + * eviction. The first check takes care of the accessed bit set on the initial
 + * fault; the second check makes sure this page hasn't been used since then.
 + * This process, AKA second chance, requires a minimum of two generations,
 + * hence MIN_NR_GENS. And to maintain ABI compatibility with the active/inactive
 + * LRU, e.g., /proc/vmstat, these two generations are considered active; the
 + * rest of generations, if they exist, are considered inactive. See
 + * lru_gen_is_active().
 + *
 + * PG_active is always cleared while a page is on one of lrugen->lists[] so that
 + * the aging need not worry about it. And it's set again when a page
 + * considered active is isolated for non-reclaiming purposes, e.g., migration.
 + * See lru_gen_add_folio() and lru_gen_del_folio().
 + *
 + * MAX_NR_GENS is set to 4 so that the multi-gen LRU can support twice the
 + * number of categories of the active/inactive LRU when keeping track of
 + * accesses through page tables. This requires order_base_2(MAX_NR_GENS+1) bits
 + * in folio->flags.
 + */
 +#define MIN_NR_GENS		2U
 +#define MAX_NR_GENS		4U
 +
 +#ifndef __GENERATING_BOUNDS_H
 +
 +struct lruvec;
 +
 +#define LRU_GEN_MASK		((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
 +#define LRU_REFS_MASK		((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
 +
 +#ifdef CONFIG_LRU_GEN
 +
 +enum {
 +	LRU_GEN_ANON,
 +	LRU_GEN_FILE,
 +};
 +
 +/*
 + * The youngest generation number is stored in max_seq for both anon and file
 + * types as they are aged on an equal footing. The oldest generation numbers are
 + * stored in min_seq[] separately for anon and file types as clean file pages
 + * can be evicted regardless of swap constraints.
 + *
 + * Normally anon and file min_seq are in sync. But if swapping is constrained,
 + * e.g., out of swap space, file min_seq is allowed to advance and leave anon
 + * min_seq behind.
 + *
 + * The number of pages in each generation is eventually consistent and therefore
 + * can be transiently negative.
 + */
 +struct lru_gen_struct {
 +	/* the aging increments the youngest generation number */
 +	unsigned long max_seq;
 +	/* the eviction increments the oldest generation numbers, one per type (anon and file) */
 +	unsigned long min_seq[ANON_AND_FILE];
 +	/* the multi-gen LRU lists, lazily sorted on eviction; indexed by [gen][type][zone] */
 +	struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
 +	/* the multi-gen LRU sizes, eventually consistent; signed as they can be transiently negative */
 +	long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
 +};
 +
 +void lru_gen_init_lruvec(struct lruvec *lruvec);
 +
 +#ifdef CONFIG_MEMCG
 +void lru_gen_init_memcg(struct mem_cgroup *memcg);
 +void lru_gen_exit_memcg(struct mem_cgroup *memcg);
 +#endif
 +
 +#else /* !CONFIG_LRU_GEN */
 +
 +static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
 +{
 +}
 +
 +#ifdef CONFIG_MEMCG
 +static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
 +{
 +}
 +
 +static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg)
 +{
 +}
 +#endif
 +
 +#endif /* CONFIG_LRU_GEN */
 +
 struct lruvec {
 	struct list_head		lists[NR_LRU_LISTS];
 	/* per lruvec lru_lock for memcg */
@@ -331,6 +427,10 @@ struct lruvec {
 	unsigned long			refaults[ANON_AND_FILE];
 	/* Various lruvec state flags (enum lruvec_flags) */
 	unsigned long			flags;
 +#ifdef CONFIG_LRU_GEN
 +	/* evictable pages divided into generations */
 +	struct lru_gen_struct		lrugen;
 +#endif
 #ifdef CONFIG_MEMCG
 	struct pglist_data *pgdat;
 #endif
@@ -746,6 +846,8 @@ static inline bool zone_is_empty(struct zone *zone)
 #define ZONES_PGOFF		(NODES_PGOFF - ZONES_WIDTH)
 #define LAST_CPUPID_PGOFF	(ZONES_PGOFF - LAST_CPUPID_WIDTH)
 #define KASAN_TAG_PGOFF		(LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH)
 +#define LRU_GEN_PGOFF		(KASAN_TAG_PGOFF - LRU_GEN_WIDTH)
 +#define LRU_REFS_PGOFF		(LRU_GEN_PGOFF - LRU_REFS_WIDTH)
 /*
  * Define the bit shifts to access each section.  For non-existent
 diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h
 index ef1e3e736e14..240905407a18 100644
 --- a/include/linux/page-flags-layout.h
 +++ b/include/linux/page-flags-layout.h
@@ -55,7 +55,8 @@
 #define SECTIONS_WIDTH		0
 #endif
 -#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
 +#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_SHIFT \
 +	<= BITS_PER_LONG - NR_PAGEFLAGS
 #define NODES_WIDTH		NODES_SHIFT
 #elif defined(CONFIG_SPARSEMEM_VMEMMAP)
 #error "Vmemmap: No space for nodes field in page flags"
@@ -89,8 +90,8 @@
 #define LAST_CPUPID_SHIFT 0
 #endif
 -#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT \
 -	<= BITS_PER_LONG - NR_PAGEFLAGS
 +#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \
 +	KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
 #define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT
 #else
 #define LAST_CPUPID_WIDTH 0
@@ -100,10 +101,12 @@
 #define LAST_CPUPID_NOT_IN_PAGE_FLAGS
 #endif
 -#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH \
 -	> BITS_PER_LONG - NR_PAGEFLAGS
 +#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \
 +	KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS
 #error "Not enough bits in page flags"
 #endif
 +#define LRU_REFS_WIDTH	0
 +
 #endif
 #endif /* _LINUX_PAGE_FLAGS_LAYOUT */
 diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
 index 465ff35a8c00..0b0ae5084e60 100644
 --- a/include/linux/page-flags.h
 +++ b/include/linux/page-flags.h
@@ -1058,7 +1058,7 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page)
 	 1UL << PG_private	| 1UL << PG_private_2	|	\
 	 1UL << PG_writeback	| 1UL << PG_reserved	|	\
 	 1UL << PG_slab		| 1UL << PG_active 	|	\
 -	 1UL << PG_unevictable	| __PG_MLOCKED)
 +	 1UL << PG_unevictable	| __PG_MLOCKED | LRU_GEN_MASK)
 /*
  * Flags checked when a page is prepped for return by the page allocator.
@@ -1069,7 +1069,7 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page)
  * alloc-free cycle to prevent from reusing the page.
  */
 #define PAGE_FLAGS_CHECK_AT_PREP	\
 -	(PAGEFLAGS_MASK & ~__PG_HWPOISON)
 +	((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK)
 #define PAGE_FLAGS_PRIVATE				\
 	(1UL << PG_private | 1UL << PG_private_2)
 diff --git a/include/linux/sched.h b/include/linux/sched.h
 index e7b2f8a5c711..8cc46a789193 100644
 --- a/include/linux/sched.h
 +++ b/include/linux/sched.h
@@ -914,6 +914,10 @@ struct task_struct {
 #ifdef CONFIG_MEMCG
 	unsigned			in_user_fault:1;
 #endif
 +#ifdef CONFIG_LRU_GEN
 +	/* whether the LRU algorithm may apply to this access */
 +	unsigned			in_lru_fault:1;
 +#endif
 #ifdef CONFIG_COMPAT_BRK
 	unsigned			brk_randomized:1;
 #endif
 diff --git a/kernel/bounds.c b/kernel/bounds.c
 index 9795d75b09b2..5ee60777d8e4 100644
 --- a/kernel/bounds.c
 +++ b/kernel/bounds.c
@@ -22,6 +22,11 @@ int main(void)
 	DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
 #endif
 	DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
 +#ifdef CONFIG_LRU_GEN
 +	DEFINE(LRU_GEN_WIDTH, order_base_2(MAX_NR_GENS + 1));
 +#else
 +	DEFINE(LRU_GEN_WIDTH, 0);
 +#endif
 	/* End of constants */
 	return 0;
 diff --git a/mm/Kconfig b/mm/Kconfig
 index 0331f1461f81..d95f07cd6dcf 100644
 --- a/mm/Kconfig
 +++ b/mm/Kconfig
@@ -1124,6 +1124,14 @@ config PTE_MARKER_UFFD_WP
 	  purposes.  It is required to enable userfaultfd write protection on
 	  file-backed memory types like shmem and hugetlbfs.
 +config LRU_GEN
 +	bool "Multi-Gen LRU"
 +	depends on MMU
 +	# make sure folio->flags has enough spare bits
 +	depends on 64BIT || !SPARSEMEM || SPARSEMEM_VMEMMAP
 +	help
 +	  A high performance LRU implementation to overcommit memory.
 +
 source "mm/damon/Kconfig"
 endmenu
 diff --git a/mm/huge_memory.c b/mm/huge_memory.c
 index f42bb51e023a..79e0b08b4cf9 100644
 --- a/mm/huge_memory.c
 +++ b/mm/huge_memory.c
@@ -2438,7 +2438,8 @@ static void __split_huge_page_tail(struct page *head, int tail,
 #ifdef CONFIG_64BIT
 			 (1L << PG_arch_2) |
 #endif
 -			 (1L << PG_dirty)));
 +			 (1L << PG_dirty) |
 +			 LRU_GEN_MASK | LRU_REFS_MASK));
 	/* ->mapping in first tail page is compound_mapcount */
 	VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
 diff --git a/mm/memcontrol.c b/mm/memcontrol.c
 index b69979c9ced5..5fd38d12149c 100644
 --- a/mm/memcontrol.c
 +++ b/mm/memcontrol.c
@@ -5170,6 +5170,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 static void mem_cgroup_free(struct mem_cgroup *memcg)
 {
 +	lru_gen_exit_memcg(memcg);
 	memcg_wb_domain_exit(memcg);
 	__mem_cgroup_free(memcg);
 }
@@ -5228,6 +5229,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
 	memcg->deferred_split_queue.split_queue_len = 0;
 #endif
 	idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
 +	lru_gen_init_memcg(memcg);
 	return memcg;
 fail:
 	mem_cgroup_id_remove(memcg);
 diff --git a/mm/memory.c b/mm/memory.c
 index 68294ce1cb06..cd1b5bfd9f3e 100644
 --- a/mm/memory.c
 +++ b/mm/memory.c
@@ -5108,6 +5108,27 @@ static inline void mm_account_fault(struct pt_regs *regs,
 		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
 }
 +#ifdef CONFIG_LRU_GEN
 +static void lru_gen_enter_fault(struct vm_area_struct *vma)
 +{
 +	/* the LRU algorithm doesn't apply to sequential or random reads */
 +	current->in_lru_fault = !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ)); /* reset by lru_gen_exit_fault() */
 +}
 +
 +static void lru_gen_exit_fault(void)
 +{
 +	current->in_lru_fault = false; /* pairs with lru_gen_enter_fault() in handle_mm_fault() */
 +}
 +#else
 +static void lru_gen_enter_fault(struct vm_area_struct *vma)
 +{
 +}
 +
 +static void lru_gen_exit_fault(void)
 +{
 +}
 +#endif /* CONFIG_LRU_GEN */
 +
 /*
  * By the time we get here, we already hold the mm semaphore
  *
@@ -5139,11 +5160,15 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 	if (flags & FAULT_FLAG_USER)
 		mem_cgroup_enter_user_fault();
 +	lru_gen_enter_fault(vma);
 +
 	if (unlikely(is_vm_hugetlb_page(vma)))
 		ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
 	else
 		ret = __handle_mm_fault(vma, address, flags);
 +	lru_gen_exit_fault();
 +
 	if (flags & FAULT_FLAG_USER) {
 		mem_cgroup_exit_user_fault();
 		/*
 diff --git a/mm/mm_init.c b/mm/mm_init.c
 index 9ddaf0e1b0ab..0d7b2bd2454a 100644
 --- a/mm/mm_init.c
 +++ b/mm/mm_init.c
@@ -65,14 +65,16 @@ void __init mminit_verify_pageflags_layout(void)
 	shift = 8 * sizeof(unsigned long);
 	width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH
 -		- LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH;
 +		- LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH;
 	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
 -		"Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n",
 +		"Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n",
 		SECTIONS_WIDTH,
 		NODES_WIDTH,
 		ZONES_WIDTH,
 		LAST_CPUPID_WIDTH,
 		KASAN_TAG_WIDTH,
 +		LRU_GEN_WIDTH,
 +		LRU_REFS_WIDTH,
 		NR_PAGEFLAGS);
 	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
 		"Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n",
 diff --git a/mm/mmzone.c b/mm/mmzone.c
 index 0ae7571e35ab..68e1511be12d 100644
 --- a/mm/mmzone.c
 +++ b/mm/mmzone.c
@@ -88,6 +88,8 @@ void lruvec_init(struct lruvec *lruvec)
 	 * Poison its list head, so that any operations on it would crash.
 	 */
 	list_del(&lruvec->lists[LRU_UNEVICTABLE]);
 +
 +	lru_gen_init_lruvec(lruvec);
 }
 #if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS)
 diff --git a/mm/swap.c b/mm/swap.c
 index 9cee7f6a3809..0e423b7d458b 100644
 --- a/mm/swap.c
 +++ b/mm/swap.c
@@ -484,6 +484,11 @@ void folio_add_lru(struct folio *folio)
 			folio_test_unevictable(folio), folio);
 	VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
 +	/* see the comment in lru_gen_add_folio() */
 +	if (lru_gen_enabled() && !folio_test_unevictable(folio) &&
 +	    lru_gen_in_fault() && !(current->flags & PF_MEMALLOC))
 +		folio_set_active(folio);
 +
 	folio_get(folio);
 	local_lock(&cpu_fbatches.lock);
 	fbatch = this_cpu_ptr(&cpu_fbatches.lru_add);
@@ -575,7 +580,7 @@ static void lru_deactivate_file_fn(struct lruvec *lruvec, struct folio *folio)
 static void lru_deactivate_fn(struct lruvec *lruvec, struct folio *folio)
 {
 -	if (folio_test_active(folio) && !folio_test_unevictable(folio)) {
 +	if (!folio_test_unevictable(folio) && (folio_test_active(folio) || lru_gen_enabled())) {
 		long nr_pages = folio_nr_pages(folio);
 		lruvec_del_folio(lruvec, folio);
@@ -688,8 +693,8 @@ void deactivate_page(struct page *page)
 {
 	struct folio *folio = page_folio(page);
 -	if (folio_test_lru(folio) && folio_test_active(folio) &&
 -	    !folio_test_unevictable(folio)) {
 +	if (folio_test_lru(folio) && !folio_test_unevictable(folio) &&
 +	    (folio_test_active(folio) || lru_gen_enabled())) {
 		struct folio_batch *fbatch;
 		folio_get(folio);
 diff --git a/mm/vmscan.c b/mm/vmscan.c
 index 4e4331367db9..fb76cfe2fdc2 100644
 --- a/mm/vmscan.c
 +++ b/mm/vmscan.c
@@ -3050,6 +3050,81 @@ static bool can_age_anon_pages(struct pglist_data *pgdat,
 	return can_demote(pgdat->node_id, sc);
 }
 +#ifdef CONFIG_LRU_GEN
 +
 +/******************************************************************************
 + *                          shorthand helpers
 + ******************************************************************************/
 +
 +#define for_each_gen_type_zone(gen, type, zone)				\
 +	for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++)			\
 +		for ((type) = 0; (type) < ANON_AND_FILE; (type)++)	\
 +			for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
 +
 +static struct lruvec __maybe_unused *get_lruvec(struct mem_cgroup *memcg, int nid)
 +{
 +	struct pglist_data *pgdat = NODE_DATA(nid);
 +
 +#ifdef CONFIG_MEMCG
 +	if (memcg) {
 +		struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec; /* this memcg's lruvec on node nid */
 +
 +		/* for hotadd_new_pgdat(): fill in the back pointer if it is still unset */
 +		if (!lruvec->pgdat)
 +			lruvec->pgdat = pgdat;
 +
 +		return lruvec;
 +	}
 +#endif
 +	VM_WARN_ON_ONCE(!mem_cgroup_disabled()); /* no memcg given: only expected when memcg is disabled */
 +
 +	return pgdat ? &pgdat->__lruvec : NULL; /* the node's own lruvec, or NULL without a pgdat */
 +}
 +
 +/******************************************************************************
 + *                          initialization
 + ******************************************************************************/
 +
 +void lru_gen_init_lruvec(struct lruvec *lruvec)
 +{
 +	int gen, type, zone;
 +	struct lru_gen_struct *lrugen = &lruvec->lrugen;
 +
 +	lrugen->max_seq = MIN_NR_GENS + 1; /* min_seq[] and nr_pages[] are not written here */
 +
 +	for_each_gen_type_zone(gen, type, zone) /* visits every [gen][type][zone] list head */
 +		INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
 +}
 +
 +#ifdef CONFIG_MEMCG
 +void lru_gen_init_memcg(struct mem_cgroup *memcg)
 +{
 +}
 +
 +void lru_gen_exit_memcg(struct mem_cgroup *memcg)
 +{
 +	int nid;
 +
 +	for_each_node(nid) {
 +		struct lruvec *lruvec = get_lruvec(memcg, nid);
 +
 +		VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0,
 +					   sizeof(lruvec->lrugen.nr_pages))); /* debug check: sizes expected to be all zero */
 +	}
 +}
 +#endif
 +
 +static int __init init_lru_gen(void)
 +{
 +	BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS); /* compile-time check on the generation limits */
 +	BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS); /* the counter holds gen+1, i.e., up to MAX_NR_GENS */
 +
 +	return 0;
 +};
 +late_initcall(init_lru_gen);
 +
 +#endif /* CONFIG_LRU_GEN */
 +
 static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 {
 	unsigned long nr[NR_LRU_LISTS];
 -- 
 2.17.1
--- a/target/linux/generic/backport-6.0/105-mm-multi-gen-LRU-minimal-implementation.patch
+++ b/target/linux/generic/backport-6.0/105-mm-multi-gen-LRU-minimal-implementation.patch
--- a/target/linux/generic/backport-6.0/106-mm-multi-gen-LRU-exploit-locality-in-rmap.patch
+++ b/target/linux/generic/backport-6.0/106-mm-multi-gen-LRU-exploit-locality-in-rmap.patch
@ -0,0 +1,495 @@
 From 93fa87bdef9e7fa9977355c4712c000f31639231 Mon Sep 17 00:00:00 2001
 From: Yu Zhao <yuzhao@google.com>
 Date: Thu, 27 Jan 2022 20:43:22 -0700
 Subject: [PATCH 07/14] mm: multi-gen LRU: exploit locality in rmap
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 Searching the rmap for PTEs mapping each page on an LRU list (to test
 and clear the accessed bit) can be expensive because pages from
 different VMAs (PA space) are not cache friendly to the rmap (VA
 space). For workloads mostly using mapped pages, searching the rmap
 can incur the highest CPU cost in the reclaim path.
 This patch exploits spatial locality to reduce the trips into the
 rmap. When shrink_page_list() walks the rmap and finds a young PTE, a
 new function lru_gen_look_around() scans at most BITS_PER_LONG-1
 adjacent PTEs. On finding another young PTE, it clears the accessed
 bit and updates the gen counter of the page mapped by this PTE to
 (max_seq%MAX_NR_GENS)+1.
 Server benchmark results:
  Single workload:
    fio (buffered I/O): no change
  Single workload:
    memcached (anon): +[3, 5]%
                Ops/sec      KB/sec
      patch1-6: 1106168.46   43025.04
      patch1-7: 1147696.57   44640.29
  Configurations:
    no change
 Client benchmark results:
  kswapd profiles:
    patch1-6
      39.03%  lzo1x_1_do_compress (real work)
      18.47%  page_vma_mapped_walk (overhead)
       6.74%  _raw_spin_unlock_irq
       3.97%  do_raw_spin_lock
       2.49%  ptep_clear_flush
       2.48%  anon_vma_interval_tree_iter_first
       1.92%  folio_referenced_one
       1.88%  __zram_bvec_write
       1.48%  memmove
       1.31%  vma_interval_tree_iter_next
    patch1-7
      48.16%  lzo1x_1_do_compress (real work)
       8.20%  page_vma_mapped_walk (overhead)
       7.06%  _raw_spin_unlock_irq
       2.92%  ptep_clear_flush
       2.53%  __zram_bvec_write
       2.11%  do_raw_spin_lock
       2.02%  memmove
       1.93%  lru_gen_look_around
       1.56%  free_unref_page_list
       1.40%  memset
  Configurations:
    no change
 Signed-off-by: Yu Zhao <yuzhao@google.com>
 Acked-by: Barry Song <baohua@kernel.org>
 Acked-by: Brian Geffon <bgeffon@google.com>
 Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
 Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
 Acked-by: Steven Barrett <steven@liquorix.net>
 Acked-by: Suleiman Souhlal <suleiman@google.com>
 Tested-by: Daniel Byrne <djbyrne@mtu.edu>
 Tested-by: Donald Carr <d@chaos-reins.com>
 Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
 Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
 Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
 Tested-by: Sofia Trinh <sofia.trinh@edi.works>
 Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
 Change-Id: I4b9ca0fd20f566ce554e703f14cee3fe0048c2fd
 ---
 include/linux/memcontrol.h |  31 +++++++
 include/linux/mm.h         |   5 +
 include/linux/mmzone.h     |   6 ++
 mm/internal.h              |   1 +
 mm/memcontrol.c            |   1 +
 mm/rmap.c                  |   6 ++
 mm/swap.c                  |   4 +-
 mm/vmscan.c                | 184 +++++++++++++++++++++++++++++++++++++
 8 files changed, 236 insertions(+), 2 deletions(-)
 diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
 index 567f12323f55..d2b7f6b9998c 100644
 --- a/include/linux/memcontrol.h
 +++ b/include/linux/memcontrol.h
@@ -444,6 +444,7 @@ static inline struct obj_cgroup *__folio_objcg(struct folio *folio)
  * - LRU isolation
  * - lock_page_memcg()
  * - exclusive reference
 + * - mem_cgroup_trylock_pages()
  *
  * For a kmem folio a caller should hold an rcu read lock to protect memcg
  * associated with a kmem folio from being released.
@@ -505,6 +506,7 @@ static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio)
  * - LRU isolation
  * - lock_page_memcg()
  * - exclusive reference
 + * - mem_cgroup_trylock_pages()
  *
  * For a kmem page a caller should hold an rcu read lock to protect memcg
  * associated with a kmem page from being released.
@@ -959,6 +961,23 @@ void unlock_page_memcg(struct page *page);
 void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val);
 +/* try to stabilize folio_memcg() for all the pages in a memcg */
 +static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg)
 +{
 +	rcu_read_lock();
 +
 +	if (mem_cgroup_disabled() || !atomic_read(&memcg->moving_account))
 +		return true; /* RCU read lock stays held until mem_cgroup_unlock_pages() */
 +
 +	rcu_read_unlock(); /* moving_account is non-zero: give up without holding the lock */
 +	return false;
 +}
 +
 +static inline void mem_cgroup_unlock_pages(void)
 +{
 +	rcu_read_unlock(); /* pairs with a successful mem_cgroup_trylock_pages() */
 +}
 +
 /* idx can be of type enum memcg_stat_item or node_stat_item */
 static inline void mod_memcg_state(struct mem_cgroup *memcg,
 				   int idx, int val)
@@ -1433,6 +1452,18 @@ static inline void folio_memcg_unlock(struct folio *folio)
 {
 }
 +static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg)
 +{
 +	/* to match folio_memcg_rcu() */
 +	rcu_read_lock();
 +	return true; /* this variant always succeeds; memcg is not examined */
 +}
 +
 +static inline void mem_cgroup_unlock_pages(void)
 +{
 +	rcu_read_unlock();
 +}
 +
 static inline void mem_cgroup_handle_over_high(void)
 {
 }
 diff --git a/include/linux/mm.h b/include/linux/mm.h
 index 21f8b27bd9fd..88976a521ef5 100644
 --- a/include/linux/mm.h
 +++ b/include/linux/mm.h
@@ -1465,6 +1465,11 @@ static inline unsigned long folio_pfn(struct folio *folio)
 	return page_to_pfn(&folio->page);
 }
 +static inline struct folio *pfn_folio(unsigned long pfn)
 +{
 +	return page_folio(pfn_to_page(pfn)); /* pfn -> struct page -> folio; pfn is not validated here */
 +}
 +
 static inline atomic_t *folio_pincount_ptr(struct folio *folio)
 {
 	return &folio_page(folio, 1)->compound_pincount;
 diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
 index 07bd22149c22..2b4dc60d0f6c 100644
 --- a/include/linux/mmzone.h
 +++ b/include/linux/mmzone.h
@@ -372,6 +372,7 @@ enum lruvec_flags {
 #ifndef __GENERATING_BOUNDS_H
 struct lruvec;
 +struct page_vma_mapped_walk;
 #define LRU_GEN_MASK		((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
 #define LRU_REFS_MASK		((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
@@ -427,6 +428,7 @@ struct lru_gen_struct {
 };
 void lru_gen_init_lruvec(struct lruvec *lruvec);
 +void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
 #ifdef CONFIG_MEMCG
 void lru_gen_init_memcg(struct mem_cgroup *memcg);
@@ -439,6 +441,10 @@ static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
 {
 }
 +static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 +{
 +}
 +
 #ifdef CONFIG_MEMCG
 static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
 {
 diff --git a/mm/internal.h b/mm/internal.h
 index 785409805ed7..a1fddea6b34f 100644
 --- a/mm/internal.h
 +++ b/mm/internal.h
@@ -83,6 +83,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf);
 void folio_rotate_reclaimable(struct folio *folio);
 bool __folio_end_writeback(struct folio *folio);
 void deactivate_file_folio(struct folio *folio);
 +void folio_activate(struct folio *folio);
 void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
 		unsigned long floor, unsigned long ceiling);
 diff --git a/mm/memcontrol.c b/mm/memcontrol.c
 index 5fd38d12149c..882180866e31 100644
 --- a/mm/memcontrol.c
 +++ b/mm/memcontrol.c
@@ -2789,6 +2789,7 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
 	 * - LRU isolation
 	 * - lock_page_memcg()
 	 * - exclusive reference
 +	 * - mem_cgroup_trylock_pages()
 	 */
 	folio->memcg_data = (unsigned long)memcg;
 }
 diff --git a/mm/rmap.c b/mm/rmap.c
 index 93d5a6f793d2..9e0ce48bca08 100644
 --- a/mm/rmap.c
 +++ b/mm/rmap.c
@@ -833,6 +833,12 @@ static bool folio_referenced_one(struct folio *folio,
 		}
 		if (pvmw.pte) {
 +			if (lru_gen_enabled() && pte_young(*pvmw.pte) &&
 +			    !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))) {
 +				lru_gen_look_around(&pvmw);
 +				referenced++;
 +			}
 +
 			if (ptep_clear_flush_young_notify(vma, address,
 						pvmw.pte)) {
 				/*
 diff --git a/mm/swap.c b/mm/swap.c
 index f74fd51fa9e1..0a3871a70952 100644
 --- a/mm/swap.c
 +++ b/mm/swap.c
@@ -366,7 +366,7 @@ static void folio_activate_drain(int cpu)
 		folio_batch_move_lru(fbatch, folio_activate_fn);
 }
 -static void folio_activate(struct folio *folio)
 +void folio_activate(struct folio *folio)
 {
 	if (folio_test_lru(folio) && !folio_test_active(folio) &&
 	    !folio_test_unevictable(folio)) {
@@ -385,7 +385,7 @@ static inline void folio_activate_drain(int cpu)
 {
 }
 -static void folio_activate(struct folio *folio)
 +void folio_activate(struct folio *folio)
 {
 	struct lruvec *lruvec;
 diff --git a/mm/vmscan.c b/mm/vmscan.c
 index 8e63f95a5f53..8686918e238d 100644
 --- a/mm/vmscan.c
 +++ b/mm/vmscan.c
@@ -1635,6 +1635,11 @@ static unsigned int shrink_page_list(struct list_head *page_list,
 		if (!sc->may_unmap && folio_mapped(folio))
 			goto keep_locked;
 +		/* folio_update_gen() tried to promote this page? */
 +		if (lru_gen_enabled() && !ignore_references &&
 +		    folio_mapped(folio) && folio_test_referenced(folio))
 +			goto keep_locked;
 +
 		/*
 		 * The number of dirty pages determines if a node is marked
 		 * reclaim_congested. kswapd will stall and start writing
@@ -3219,6 +3224,29 @@ static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv)
  *                          the aging
  ******************************************************************************/
 +/* promote pages accessed through page tables; returns the old gen, or -1 if the counter was 0 */
 +static int folio_update_gen(struct folio *folio, int gen)
 +{
 +	unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
 +
 +	VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
 +	VM_WARN_ON_ONCE(!rcu_read_lock_held());
 +
 +	do {
 +		/* lru_gen_del_folio() has isolated this page? */
 +		if (!(old_flags & LRU_GEN_MASK)) {
 +			/* for shrink_page_list() */
 +			new_flags = old_flags | BIT(PG_referenced);
 +			continue; /* in a do-while, this jumps to the try_cmpxchg() below */
 +		}
 +
 +		new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
 +		new_flags |= (gen + 1UL) << LRU_GEN_PGOFF; /* the counter stores gen+1 */
 +	} while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); /* retry if flags changed meanwhile */
 +
 +	return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
 +}
 +
 /* protect pages accessed multiple times through file descriptors */
 static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
 {
@@ -3230,6 +3258,11 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai
 	VM_WARN_ON_ONCE_FOLIO(!(old_flags & LRU_GEN_MASK), folio);
 	do {
 +		new_gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
 +		/* folio_update_gen() has promoted this page? */
 +		if (new_gen >= 0 && new_gen != old_gen)
 +			return new_gen;
 +
 		new_gen = (old_gen + 1) % MAX_NR_GENS;
 		new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
@@ -3244,6 +3277,43 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai
 	return new_gen;
 }
 +static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr)
 +{
 +	unsigned long pfn = pte_pfn(pte);
 +
 +	VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end);
 +
 +	if (!pte_present(pte) || is_zero_pfn(pfn))
 +		return -1; /* -1 (as unsigned long) tells lru_gen_look_around() to skip this PTE */
 +
 +	if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte)))
 +		return -1; /* not expected here, hence the one-time warning */
 +
 +	if (WARN_ON_ONCE(!pfn_valid(pfn)))
 +		return -1; /* likewise unexpected */
 +
 +	return pfn;
 +}
 +
 +static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg,
 +				   struct pglist_data *pgdat)
 +{
 +	struct folio *folio;
 +
 +	/* try to avoid unnecessary memory loads: reject PFNs outside the node's range first */
 +	if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
 +		return NULL;
 +
 +	folio = pfn_folio(pfn);
 +	if (folio_nid(folio) != pgdat->node_id)
 +		return NULL; /* in range, but the folio is on another node */
 +
 +	if (folio_memcg_rcu(folio) != memcg)
 +		return NULL; /* the folio's memcg is not the one passed in */
 +
 +	return folio;
 +}
 +
 static void inc_min_seq(struct lruvec *lruvec, int type)
 {
 	struct lru_gen_struct *lrugen = &lruvec->lrugen;
@@ -3443,6 +3513,114 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
 	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
 }
 +/*
 + * This function exploits spatial locality when shrink_page_list() walks the
 + * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages.
 + */
 +void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
 +{
 +	int i;
 +	pte_t *pte;
 +	unsigned long start;
 +	unsigned long end;
 +	unsigned long addr;
 +	unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {}; /* bit i: PTE i maps a folio to promote */
 +	struct folio *folio = pfn_folio(pvmw->pfn);
 +	struct mem_cgroup *memcg = folio_memcg(folio);
 +	struct pglist_data *pgdat = folio_pgdat(folio);
 +	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
 +	DEFINE_MAX_SEQ(lruvec);
 +	int old_gen, new_gen = lru_gen_from_seq(max_seq);
 +
 +	lockdep_assert_held(pvmw->ptl);
 +	VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio);
 +
 +	if (spin_is_contended(pvmw->ptl))
 +		return; /* best effort: do nothing while others wait for this page table lock */
 +
 +	start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start); /* clamp to the PMD range and the VMA */
 +	end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1;
 +
 +	if (end - start > MIN_LRU_BATCH * PAGE_SIZE) { /* shrink to MIN_LRU_BATCH pages around the address */
 +		if (pvmw->address - start < MIN_LRU_BATCH * PAGE_SIZE / 2)
 +			end = start + MIN_LRU_BATCH * PAGE_SIZE;
 +		else if (end - pvmw->address < MIN_LRU_BATCH * PAGE_SIZE / 2)
 +			start = end - MIN_LRU_BATCH * PAGE_SIZE;
 +		else {
 +			start = pvmw->address - MIN_LRU_BATCH * PAGE_SIZE / 2;
 +			end = pvmw->address + MIN_LRU_BATCH * PAGE_SIZE / 2;
 +		}
 +	}
 +
 +	pte = pvmw->pte - (pvmw->address - start) / PAGE_SIZE; /* PTE for 'start', so pte[i] maps addr */
 +
 +	rcu_read_lock();
 +	arch_enter_lazy_mmu_mode();
 +
 +	for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) {
 +		unsigned long pfn;
 +
 +		pfn = get_pte_pfn(pte[i], pvmw->vma, addr);
 +		if (pfn == -1)
 +			continue; /* get_pte_pfn() rejected this PTE */
 +
 +		if (!pte_young(pte[i]))
 +			continue;
 +
 +		folio = get_pfn_folio(pfn, memcg, pgdat);
 +		if (!folio)
 +			continue; /* different node or memcg */
 +
 +		if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i))
 +			VM_WARN_ON_ONCE(true); /* the PTE tested young just above */
 +
 +		if (pte_dirty(pte[i]) && !folio_test_dirty(folio) &&
 +		    !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
 +		      !folio_test_swapcache(folio)))
 +			folio_mark_dirty(folio);
 +
 +		old_gen = folio_lru_gen(folio);
 +		if (old_gen < 0)
 +			folio_set_referenced(folio); /* no gen to update: leave a hint in PG_referenced */
 +		else if (old_gen != new_gen)
 +			__set_bit(i, bitmap); /* promoted after the PTE scan, below */
 +	}
 +
 +	arch_leave_lazy_mmu_mode();
 +	rcu_read_unlock();
 +
 +	if (bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) { /* few candidates: activate one by one */
 +		for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
 +			folio = pfn_folio(pte_pfn(pte[i]));
 +			folio_activate(folio);
 +		}
 +		return;
 +	}
 +
 +	/* folio_update_gen() requires stable folio_memcg() */
 +	if (!mem_cgroup_trylock_pages(memcg))
 +		return; /* best effort: skip the batched update */
 +
 +	spin_lock_irq(&lruvec->lru_lock);
 +	new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq); /* re-read max_seq under lru_lock */
 +
 +	for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
 +		folio = pfn_folio(pte_pfn(pte[i]));
 +		if (folio_memcg_rcu(folio) != memcg)
 +			continue; /* re-checked: the first check ran before this lock was taken */
 +
 +		old_gen = folio_update_gen(folio, new_gen);
 +		if (old_gen < 0 || old_gen == new_gen)
 +			continue; /* no generation change, so no size accounting */
 +
 +		lru_gen_update_size(lruvec, folio, old_gen, new_gen);
 +	}
 +
 +	spin_unlock_irq(&lruvec->lru_lock);
 +
 +	mem_cgroup_unlock_pages();
 +}
 +
 /******************************************************************************
  *                          the eviction
  ******************************************************************************/
@@ -3479,6 +3657,12 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx)
 		return true;
 	}
 +	/* promoted */
 +	if (gen != lru_gen_from_seq(lrugen->min_seq[type])) {
 +		list_move(&folio->lru, &lrugen->lists[gen][type][zone]);
 +		return true;
 +	}
 +
 	/* protected */
 	if (tier > tier_idx) {
 		int hist = lru_hist_from_seq(lrugen->min_seq[type]);
 -- 
 2.17.1
--- a/target/linux/generic/backport-6.0/107-mm-multi-gen-LRU-support-page-table-walks.patch
+++ b/target/linux/generic/backport-6.0/107-mm-multi-gen-LRU-support-page-table-walks.patch
--- a/target/linux/generic/backport-6.0/108-mm-multi-gen-LRU-optimize-multiple-memcgs.patch
+++ b/target/linux/generic/backport-6.0/108-mm-multi-gen-LRU-optimize-multiple-memcgs.patch
@ -0,0 +1,295 @@
 From 6b9670b94ba2b49b289b997121062500e32fc3e4 Mon Sep 17 00:00:00 2001
 From: Yu Zhao <yuzhao@google.com>
 Date: Thu, 27 Jan 2022 19:59:54 -0700
 Subject: [PATCH 09/14] mm: multi-gen LRU: optimize multiple memcgs
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 When multiple memcgs are available, it is possible to use generations
 as a frame of reference to make better choices and improve overall
 performance under global memory pressure. This patch adds a basic
 optimization to select memcgs that can drop single-use unmapped clean
 pages first. Doing so reduces the chance of going into the aging path
 or swapping, which can be costly.
 A typical example that benefits from this optimization is a server
 running mixed types of workloads, e.g., heavy anon workload in one
 memcg and heavy buffered I/O workload in the other.
 Though this optimization can be applied to both kswapd and direct
 reclaim, it is only added to kswapd to keep the patchset manageable.
 Later improvements may cover the direct reclaim path.
 While ensuring certain fairness to all eligible memcgs, proportional
 scans of individual memcgs also require proper backoff to avoid
 overshooting their aggregate reclaim target by too much. Otherwise it
 can cause high direct reclaim latency. The conditions for backoff are:
 1. At low priorities, for direct reclaim, if aging fairness or direct
   reclaim latency is at risk, i.e., aging one memcg multiple times or
   swapping after the target is met.
 2. At high priorities, for global reclaim, if per-zone free pages are
   above respective watermarks.
 Server benchmark results:
  Mixed workloads:
    fio (buffered I/O): +[19, 21]%
                IOPS         BW
      patch1-8: 1880k        7343MiB/s
      patch1-9: 2252k        8796MiB/s
    memcached (anon): +[119, 123]%
                Ops/sec      KB/sec
      patch1-8: 862768.65    33514.68
      patch1-9: 1911022.12   74234.54
  Mixed workloads:
    fio (buffered I/O): +[75, 77]%
                IOPS         BW
      5.19-rc1: 1279k        4996MiB/s
      patch1-9: 2252k        8796MiB/s
    memcached (anon): +[13, 15]%
                Ops/sec      KB/sec
      5.19-rc1: 1673524.04   65008.87
      patch1-9: 1911022.12   74234.54
  Configurations:
    (changes since patch 6)
    cat mixed.sh
    modprobe brd rd_nr=2 rd_size=56623104
    swapoff -a
    mkswap /dev/ram0
    swapon /dev/ram0
    mkfs.ext4 /dev/ram1
    mount -t ext4 /dev/ram1 /mnt
    memtier_benchmark -S /var/run/memcached/memcached.sock \
      -P memcache_binary -n allkeys --key-minimum=1 \
      --key-maximum=50000000 --key-pattern=P:P -c 1 -t 36 \
      --ratio 1:0 --pipeline 8 -d 2000
    fio -name=mglru --numjobs=36 --directory=/mnt --size=1408m \
      --buffered=1 --ioengine=io_uring --iodepth=128 \
      --iodepth_batch_submit=32 --iodepth_batch_complete=32 \
      --rw=randread --random_distribution=random --norandommap \
      --time_based --ramp_time=10m --runtime=90m --group_reporting &
    pid=$!
    sleep 200
    memtier_benchmark -S /var/run/memcached/memcached.sock \
      -P memcache_binary -n allkeys --key-minimum=1 \
      --key-maximum=50000000 --key-pattern=R:R -c 1 -t 36 \
      --ratio 0:1 --pipeline 8 --randomize --distinct-client-seed
    kill -INT $pid
    wait
 Client benchmark results:
  no change (CONFIG_MEMCG=n)
 Signed-off-by: Yu Zhao <yuzhao@google.com>
 Acked-by: Brian Geffon <bgeffon@google.com>
 Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
 Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
 Acked-by: Steven Barrett <steven@liquorix.net>
 Acked-by: Suleiman Souhlal <suleiman@google.com>
 Tested-by: Daniel Byrne <djbyrne@mtu.edu>
 Tested-by: Donald Carr <d@chaos-reins.com>
 Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
 Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
 Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
 Tested-by: Sofia Trinh <sofia.trinh@edi.works>
 Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
 Change-Id: I7e00e0c733437e534ac98031cf8154a681becc00
 ---
 mm/vmscan.c | 104 +++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 95 insertions(+), 9 deletions(-)
 diff --git a/mm/vmscan.c b/mm/vmscan.c
 index fcb437769a60..e7b74ab67973 100644
 --- a/mm/vmscan.c
 +++ b/mm/vmscan.c
@@ -131,6 +131,12 @@ struct scan_control {
 	/* Always discard instead of demoting to lower tier memory */
 	unsigned int no_demotion:1;
 +#ifdef CONFIG_LRU_GEN
 +	/* help kswapd make better choices among multiple memcgs */
 +	unsigned int memcgs_need_aging:1;
 +	unsigned long last_reclaimed;
 +#endif
 +
 	/* Allocation order */
 	s8 order;
@@ -4429,6 +4435,19 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
 	VM_WARN_ON_ONCE(!current_is_kswapd());
 +	sc->last_reclaimed = sc->nr_reclaimed;
 +
 +	/*
 +	 * To reduce the chance of going into the aging path, which can be
 +	 * costly, optimistically skip it if the flag below was cleared in the
 +	 * eviction path. This improves the overall performance when multiple
 +	 * memcgs are available.
 +	 */
 +	if (!sc->memcgs_need_aging) {
 +		sc->memcgs_need_aging = true;
 +		return;
 +	}
 +
 	set_mm_walk(pgdat);
 	memcg = mem_cgroup_iter(NULL, NULL, NULL);
@@ -4840,7 +4859,8 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw
 	return scanned;
 }
 -static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
 +static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
 +			bool *need_swapping)
 {
 	int type;
 	int scanned;
@@ -4903,6 +4923,9 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
 	sc->nr_reclaimed += reclaimed;
 +	if (need_swapping && type == LRU_GEN_ANON)
 +		*need_swapping = true;
 +
 	return scanned;
 }
@@ -4912,9 +4935,8 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
  *    reclaim.
  */
 static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
 -				    bool can_swap)
 +				    bool can_swap, bool *need_aging)
 {
 -	bool need_aging;
 	unsigned long nr_to_scan;
 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
 	DEFINE_MAX_SEQ(lruvec);
@@ -4924,8 +4946,8 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *
 	    (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim))
 		return 0;
 -	need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan);
 -	if (!need_aging)
 +	*need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan);
 +	if (!*need_aging)
 		return nr_to_scan;
 	/* skip the aging path at the default priority */
@@ -4942,10 +4964,67 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *
 	return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
 }
 +static bool should_abort_scan(struct lruvec *lruvec, unsigned long seq,
 +			      struct scan_control *sc, bool need_swapping)
 +{	/* returns true when the eviction loop in the caller should stop early */
 +	int i;
 +	DEFINE_MAX_SEQ(lruvec);
 +
 +	if (!current_is_kswapd()) {
 +		/* age each memcg at most once to ensure fairness */
 +		if (max_seq - seq > 1)	/* seq is the max_seq sampled by the caller on entry */
 +			return true;
 +
 +		/* over-swapping can increase allocation latency */
 +		if (sc->nr_reclaimed >= sc->nr_to_reclaim && need_swapping)
 +			return true;
 +
 +		/* give this thread a chance to exit and free its memory */
 +		if (fatal_signal_pending(current)) {
 +			sc->nr_reclaimed += MIN_LRU_BATCH;	/* NOTE(review): presumably reported as progress so outer loops stop - confirm */
 +			return true;
 +		}
 +
 +		if (cgroup_reclaim(sc))	/* the watermark checks below are skipped for memcg reclaim */
 +			return false;
 +	} else if (sc->nr_reclaimed - sc->last_reclaimed < sc->nr_to_reclaim)	/* kswapd: target not met since last_reclaimed */
 +		return false;
 +
 +	/* keep scanning at low priorities to ensure fairness */
 +	if (sc->priority > DEF_PRIORITY - 2)
 +		return false;
 +
 +	/*
 +	 * A minimum amount of work was done under global memory pressure. For
 +	 * kswapd, it may be overshooting. For direct reclaim, the allocation
 +	 * may succeed if all suitable zones are somewhat safe. In either case,
 +	 * it's better to stop now, and restart later if necessary.
 +	 */
 +	for (i = 0; i <= sc->reclaim_idx; i++) {
 +		unsigned long wmark;
 +		struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i;
 +
 +		if (!managed_zone(zone))
 +			continue;
 +
 +		wmark = current_is_kswapd() ? high_wmark_pages(zone) : low_wmark_pages(zone);
 +		if (wmark > zone_page_state(zone, NR_FREE_PAGES))	/* this zone is still below its watermark */
 +			return false;
 +	}
 +
 +	sc->nr_reclaimed += MIN_LRU_BATCH;	/* NOTE(review): same bump as the fatal-signal path above - confirm intent */
 +
 +	return true;
 +}
 +
 static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 {
 	struct blk_plug plug;
 +	bool need_aging = false;
 +	bool need_swapping = false;
 	unsigned long scanned = 0;
 +	unsigned long reclaimed = sc->nr_reclaimed;
 +	DEFINE_MAX_SEQ(lruvec);
 	lru_add_drain();
@@ -4965,21 +5044,28 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
 		else
 			swappiness = 0;
 -		nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
 +		nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, &need_aging);
 		if (!nr_to_scan)
 -			break;
 +			goto done;
 -		delta = evict_folios(lruvec, sc, swappiness);
 +		delta = evict_folios(lruvec, sc, swappiness, &need_swapping);
 		if (!delta)
 -			break;
 +			goto done;
 		scanned += delta;
 		if (scanned >= nr_to_scan)
 			break;
 +		if (should_abort_scan(lruvec, max_seq, sc, need_swapping))
 +			break;
 +
 		cond_resched();
 	}
 +	/* see the comment in lru_gen_age_node() */
 +	if (sc->nr_reclaimed - reclaimed >= MIN_LRU_BATCH && !need_aging)
 +		sc->memcgs_need_aging = false;
 +done:
 	clear_mm_walk();
 	blk_finish_plug(&plug);
 -- 
 2.17.1
--- a/target/linux/generic/backport-6.0/109-mm-multi-gen-LRU-kill-switch.patch
+++ b/target/linux/generic/backport-6.0/109-mm-multi-gen-LRU-kill-switch.patch
@ -0,0 +1,490 @@
 From ef61bb3622ee0f36e055dfd5006badff08f5ce61 Mon Sep 17 00:00:00 2001
 From: Yu Zhao <yuzhao@google.com>
 Date: Thu, 27 Jan 2022 19:52:09 -0700
 Subject: [PATCH 10/14] mm: multi-gen LRU: kill switch
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 Add /sys/kernel/mm/lru_gen/enabled as a kill switch. Components that
 can be disabled include:
  0x0001: the multi-gen LRU core
  0x0002: walking page table, when arch_has_hw_pte_young() returns
          true
  0x0004: clearing the accessed bit in non-leaf PMD entries, when
          CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG=y
  [yYnN]: apply to all the components above
 E.g.,
  echo y >/sys/kernel/mm/lru_gen/enabled
  cat /sys/kernel/mm/lru_gen/enabled
  0x0007
  echo 5 >/sys/kernel/mm/lru_gen/enabled
  cat /sys/kernel/mm/lru_gen/enabled
  0x0005
 NB: the page table walks happen on the scale of seconds under heavy
 memory pressure, in which case the mmap_lock contention is a lesser
 concern, compared with the LRU lock contention and the I/O congestion.
 So far the only well-known case of the mmap_lock contention happens on
 Android, due to Scudo [1] which allocates several thousand VMAs for
 merely a few hundred MBs. The SPF and the Maple Tree also have
 provided their own assessments [2][3]. However, if walking page tables
 does worsen the mmap_lock contention, the kill switch can be used to
 disable it. In this case the multi-gen LRU will suffer a minor
 performance degradation, as shown previously.
 Clearing the accessed bit in non-leaf PMD entries can also be
 disabled, since this behavior was not tested on x86 varieties other
 than Intel and AMD.
 [1] https://source.android.com/devices/tech/debug/scudo
 [2] https://lore.kernel.org/r/20220128131006.67712-1-michel@lespinasse.org/
 [3] https://lore.kernel.org/r/20220426150616.3937571-1-Liam.Howlett@oracle.com/
 Signed-off-by: Yu Zhao <yuzhao@google.com>
 Acked-by: Brian Geffon <bgeffon@google.com>
 Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
 Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
 Acked-by: Steven Barrett <steven@liquorix.net>
 Acked-by: Suleiman Souhlal <suleiman@google.com>
 Tested-by: Daniel Byrne <djbyrne@mtu.edu>
 Tested-by: Donald Carr <d@chaos-reins.com>
 Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
 Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
 Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
 Tested-by: Sofia Trinh <sofia.trinh@edi.works>
 Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
 Change-Id: I4c909618e8fed7fb1337f6624bbe542ec920a515
 ---
 include/linux/cgroup.h          |  15 ++-
 include/linux/mm_inline.h       |  15 ++-
 include/linux/mmzone.h          |   9 ++
 kernel/cgroup/cgroup-internal.h |   1 -
 mm/Kconfig                      |   6 +
 mm/vmscan.c                     | 228 +++++++++++++++++++++++++++++++-
 6 files changed, 265 insertions(+), 9 deletions(-)
 diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
 index ac5d0515680e..9179463c3c9f 100644
 --- a/include/linux/cgroup.h
 +++ b/include/linux/cgroup.h
@@ -432,6 +432,18 @@ static inline void cgroup_put(struct cgroup *cgrp)
 	css_put(&cgrp->self);
 }
 +extern struct mutex cgroup_mutex;
 +
 +static inline void cgroup_lock(void)
 +{
 +	mutex_lock(&cgroup_mutex);	/* lets callers such as lru_gen_change_state() take cgroup_mutex */
 +}
 +
 +static inline void cgroup_unlock(void)
 +{
 +	mutex_unlock(&cgroup_mutex);	/* releases the mutex taken by cgroup_lock() */
 +}
 +
 /**
  * task_css_set_check - obtain a task's css_set with extra access conditions
  * @task: the task to obtain css_set for
@@ -446,7 +458,6 @@ static inline void cgroup_put(struct cgroup *cgrp)
  * as locks used during the cgroup_subsys::attach() methods.
  */
 #ifdef CONFIG_PROVE_RCU
 -extern struct mutex cgroup_mutex;
 extern spinlock_t css_set_lock;
 #define task_css_set_check(task, __c)					\
 	rcu_dereference_check((task)->cgroups,				\
@@ -708,6 +719,8 @@ struct cgroup;
 static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; }
 static inline void css_get(struct cgroup_subsys_state *css) {}
 static inline void css_put(struct cgroup_subsys_state *css) {}
 +static inline void cgroup_lock(void) {}
 +static inline void cgroup_unlock(void) {}
 static inline int cgroup_attach_task_all(struct task_struct *from,
 					 struct task_struct *t) { return 0; }
 static inline int cgroupstats_build(struct cgroupstats *stats,
 diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
 index f2b2296a42f9..4949eda9a9a2 100644
 --- a/include/linux/mm_inline.h
 +++ b/include/linux/mm_inline.h
@@ -106,10 +106,21 @@ static __always_inline enum lru_list folio_lru_list(struct folio *folio)
 #ifdef CONFIG_LRU_GEN
 +#ifdef CONFIG_LRU_GEN_ENABLED
 static inline bool lru_gen_enabled(void)
 {
 -	return true;
 +	DECLARE_STATIC_KEY_TRUE(lru_gen_caps[NR_LRU_GEN_CAPS]);
 +
 +	return static_branch_likely(&lru_gen_caps[LRU_GEN_CORE]);
 +}
 +#else
 +static inline bool lru_gen_enabled(void)
 +{
 +	DECLARE_STATIC_KEY_FALSE(lru_gen_caps[NR_LRU_GEN_CAPS]);
 +
 +	return static_branch_unlikely(&lru_gen_caps[LRU_GEN_CORE]);
 }
 +#endif
 static inline bool lru_gen_in_fault(void)
 {
@@ -222,7 +233,7 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio,
 	VM_WARN_ON_ONCE_FOLIO(gen != -1, folio);
 -	if (folio_test_unevictable(folio))
 +	if (folio_test_unevictable(folio) || !lrugen->enabled)
 		return false;
 	/*
 	 * There are three common cases for this page:
 diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
 index e5cf37dc41a4..39bca2e420b7 100644
 --- a/include/linux/mmzone.h
 +++ b/include/linux/mmzone.h
@@ -384,6 +384,13 @@ enum {
 	LRU_GEN_FILE,
 };
 +enum {
 +	LRU_GEN_CORE,
 +	LRU_GEN_MM_WALK,
 +	LRU_GEN_NONLEAF_YOUNG,
 +	NR_LRU_GEN_CAPS
 +};
 +
 #define MIN_LRU_BATCH		BITS_PER_LONG
 #define MAX_LRU_BATCH		(MIN_LRU_BATCH * 64)
@@ -425,6 +432,8 @@ struct lru_gen_struct {
 	/* can be modified without holding the LRU lock */
 	atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
 	atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
 +	/* whether the multi-gen LRU is enabled */
 +	bool enabled;
 };
 enum {
 diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
 index 36b740cb3d59..63dc3e82be4f 100644
 --- a/kernel/cgroup/cgroup-internal.h
 +++ b/kernel/cgroup/cgroup-internal.h
@@ -164,7 +164,6 @@ struct cgroup_mgctx {
 #define DEFINE_CGROUP_MGCTX(name)						\
 	struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
 -extern struct mutex cgroup_mutex;
 extern spinlock_t css_set_lock;
 extern struct cgroup_subsys *cgroup_subsys[];
 extern struct list_head cgroup_roots;
 diff --git a/mm/Kconfig b/mm/Kconfig
 index 5101dca8f21c..6c86849c4db9 100644
 --- a/mm/Kconfig
 +++ b/mm/Kconfig
@@ -1133,6 +1133,12 @@ config LRU_GEN
 	help
 	  A high performance LRU implementation to overcommit memory.
 +config LRU_GEN_ENABLED
 +	bool "Enable by default"
 +	depends on LRU_GEN
 +	help
 +	  This option enables the multi-gen LRU by default.
 +
 config LRU_GEN_STATS
 	bool "Full stats for debugging"
 	depends on LRU_GEN
 diff --git a/mm/vmscan.c b/mm/vmscan.c
 index e7b74ab67973..ea3d497019ab 100644
 --- a/mm/vmscan.c
 +++ b/mm/vmscan.c
@@ -51,6 +51,7 @@
 #include <linux/psi.h>
 #include <linux/pagewalk.h>
 #include <linux/shmem_fs.h>
 +#include <linux/ctype.h>
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -3070,6 +3071,14 @@ static bool can_age_anon_pages(struct pglist_data *pgdat,
 #ifdef CONFIG_LRU_GEN
 +#ifdef CONFIG_LRU_GEN_ENABLED
 +DEFINE_STATIC_KEY_ARRAY_TRUE(lru_gen_caps, NR_LRU_GEN_CAPS);
 +#define get_cap(cap)	static_branch_likely(&lru_gen_caps[cap])
 +#else
 +DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS);
 +#define get_cap(cap)	static_branch_unlikely(&lru_gen_caps[cap])
 +#endif
 +
 /******************************************************************************
  *                          shorthand helpers
  ******************************************************************************/
@@ -3946,7 +3955,8 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area
 			goto next;
 		if (!pmd_trans_huge(pmd[i])) {
 -			if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG))
 +			if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) &&
 +			    get_cap(LRU_GEN_NONLEAF_YOUNG))
 				pmdp_test_and_clear_young(vma, addr, pmd + i);
 			goto next;
 		}
@@ -4044,10 +4054,12 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
 		walk->mm_stats[MM_NONLEAF_TOTAL]++;
 #ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
 -		if (!pmd_young(val))
 -			continue;
 +		if (get_cap(LRU_GEN_NONLEAF_YOUNG)) {
 +			if (!pmd_young(val))
 +				continue;
 -		walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
 +			walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
 +		}
 #endif
 		if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i))
 			continue;
@@ -4309,7 +4321,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
 	 * handful of PTEs. Spreading the work out over a period of time usually
 	 * is less efficient, but it avoids bursty page faults.
 	 */
 -	if (!arch_has_hw_pte_young()) {
 +	if (!(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) {
 		success = iterate_mm_list_nowalk(lruvec, max_seq);
 		goto done;
 	}
@@ -5071,6 +5083,208 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
 	blk_finish_plug(&plug);
 }
 +/******************************************************************************
 + *                          state change
 + ******************************************************************************/
 +
 +static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
 +{	/* sanity check: only the list set matching lrugen->enabled may hold folios */
 +	struct lru_gen_struct *lrugen = &lruvec->lrugen;
 +
 +	if (lrugen->enabled) {	/* enabled: every evictable lruvec->lists[] entry must be empty */
 +		enum lru_list lru;
 +
 +		for_each_evictable_lru(lru) {
 +			if (!list_empty(&lruvec->lists[lru]))
 +				return false;
 +		}
 +	} else {	/* disabled: every lrugen->lists[][][] entry must be empty */
 +		int gen, type, zone;
 +
 +		for_each_gen_type_zone(gen, type, zone) {
 +			if (!list_empty(&lrugen->lists[gen][type][zone]))
 +				return false;
 +		}
 +	}
 +
 +	return true;
 +}
 +
 +static bool fill_evictable(struct lruvec *lruvec)
 +{	/* moves folios from lruvec->lists[] to the multi-gen lists, one batch per call */
 +	enum lru_list lru;
 +	int remaining = MAX_LRU_BATCH;	/* upper bound on folios moved by this call */
 +
 +	for_each_evictable_lru(lru) {
 +		int type = is_file_lru(lru);
 +		bool active = is_active_lru(lru);
 +		struct list_head *head = &lruvec->lists[lru];
 +
 +		while (!list_empty(head)) {
 +			bool success;
 +			struct folio *folio = lru_to_folio(head);
 +
 +			VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
 +			VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio) != active, folio);
 +			VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
 +			VM_WARN_ON_ONCE_FOLIO(folio_lru_gen(folio) != -1, folio);
 +
 +			lruvec_del_folio(lruvec, folio);	/* take it off lruvec->lists[lru] ... */
 +			success = lru_gen_add_folio(lruvec, folio, false);	/* ... and add it via the multi-gen path */
 +			VM_WARN_ON_ONCE(!success);
 +
 +			if (!--remaining)
 +				return false;	/* batch used up; the caller calls again until true */
 +		}
 +	}
 +
 +	return true;	/* every evictable lruvec->lists[] entry is now empty */
 +}
 +
 +static bool drain_evictable(struct lruvec *lruvec)
 +{	/* inverse of fill_evictable(): moves folios from the multi-gen lists back, one batch per call */
 +	int gen, type, zone;
 +	int remaining = MAX_LRU_BATCH;	/* upper bound on folios moved by this call */
 +
 +	for_each_gen_type_zone(gen, type, zone) {
 +		struct list_head *head = &lruvec->lrugen.lists[gen][type][zone];
 +
 +		while (!list_empty(head)) {
 +			bool success;
 +			struct folio *folio = lru_to_folio(head);
 +
 +			VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
 +			VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
 +			VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
 +			VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
 +
 +			success = lru_gen_del_folio(lruvec, folio, false);	/* remove it via the multi-gen path ... */
 +			VM_WARN_ON_ONCE(!success);
 +			lruvec_add_folio(lruvec, folio);	/* ... and re-add it through lruvec_add_folio() */
 +
 +			if (!--remaining)
 +				return false;	/* batch used up; the caller calls again until true */
 +		}
 +	}
 +
 +	return true;	/* every multi-gen list is now empty */
 +}
 +
 +static void lru_gen_change_state(bool enabled)
 +{	/* flips LRU_GEN_CORE and migrates the folios of every memcg/node lruvec */
 +	static DEFINE_MUTEX(state_mutex);	/* held across the whole state change below */
 +
 +	struct mem_cgroup *memcg;
 +
 +	cgroup_lock();	/* NOTE(review): presumably keeps the memcg set stable during the walk - confirm */
 +	cpus_read_lock();	/* the *_cpuslocked() static-branch calls below run under this lock */
 +	get_online_mems();
 +	mutex_lock(&state_mutex);
 +
 +	if (enabled == lru_gen_enabled())	/* already in the requested state */
 +		goto unlock;
 +
 +	if (enabled)
 +		static_branch_enable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
 +	else
 +		static_branch_disable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
 +
 +	memcg = mem_cgroup_iter(NULL, NULL, NULL);
 +	do {
 +		int nid;
 +
 +		for_each_node(nid) {
 +			struct lruvec *lruvec = get_lruvec(memcg, nid);
 +
 +			if (!lruvec)
 +				continue;
 +
 +			spin_lock_irq(&lruvec->lru_lock);
 +
 +			VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
 +			VM_WARN_ON_ONCE(!state_is_valid(lruvec));
 +
 +			lruvec->lrugen.enabled = enabled;	/* lru_gen_add_folio() refuses folios while this is false */
 +
 +			while (!(enabled ? fill_evictable(lruvec) : drain_evictable(lruvec))) {
 +				spin_unlock_irq(&lruvec->lru_lock);	/* drop the LRU lock between batches */
 +				cond_resched();
 +				spin_lock_irq(&lruvec->lru_lock);
 +			}
 +
 +			spin_unlock_irq(&lruvec->lru_lock);
 +		}
 +
 +		cond_resched();
 +	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
 +unlock:
 +	mutex_unlock(&state_mutex);	/* release in the reverse order of acquisition */
 +	put_online_mems();
 +	cpus_read_unlock();
 +	cgroup_unlock();
 +}
 +
 +/******************************************************************************
 + *                          sysfs interface
 + ******************************************************************************/
 +
 +static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
 +{
 +	unsigned int caps = 0;
 +
 +	if (get_cap(LRU_GEN_CORE))
 +		caps |= BIT(LRU_GEN_CORE);
 +
 +	if (arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))	/* reported only if the arch qualifies */
 +		caps |= BIT(LRU_GEN_MM_WALK);
 +
 +	if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && get_cap(LRU_GEN_NONLEAF_YOUNG))
 +		caps |= BIT(LRU_GEN_NONLEAF_YOUNG);
 +
 +	return sysfs_emit(buf, "0x%04x\n", caps);	/* sysfs_emit() is the sysfs-aware formatter for show() */
 +}
 +
 +static ssize_t store_enabled(struct kobject *kobj, struct kobj_attribute *attr,
 +			     const char *buf, size_t len)
 +{	/* accepts y/Y, n/N or a numeric bitmask of BIT(LRU_GEN_*) capabilities */
 +	int i;
 +	unsigned int caps;
 +
 +	if (tolower(*buf) == 'n')	/* only the first character is examined */
 +		caps = 0;
 +	else if (tolower(*buf) == 'y')
 +		caps = -1;	/* all bits set */
 +	else if (kstrtouint(buf, 0, &caps))	/* base 0: the prefix selects the radix */
 +		return -EINVAL;
 +
 +	for (i = 0; i < NR_LRU_GEN_CAPS; i++) {	/* bits at or above NR_LRU_GEN_CAPS are ignored */
 +		bool enabled = caps & BIT(i);
 +
 +		if (i == LRU_GEN_CORE)
 +			lru_gen_change_state(enabled);	/* the core switch also migrates folios between list sets */
 +		else if (enabled)
 +			static_branch_enable(&lru_gen_caps[i]);
 +		else
 +			static_branch_disable(&lru_gen_caps[i]);
 +	}
 +
 +	return len;	/* report the whole write as consumed */
 +}
 +
 +static struct kobj_attribute lru_gen_enabled_attr = __ATTR(
 +	enabled, 0644, show_enabled, store_enabled
 +);
 +
 +static struct attribute *lru_gen_attrs[] = {
 +	&lru_gen_enabled_attr.attr,
 +	NULL
 +};
 +
 +static struct attribute_group lru_gen_attr_group = {
 +	.name = "lru_gen",
 +	.attrs = lru_gen_attrs,
 +};
 +
 /******************************************************************************
  *                          initialization
  ******************************************************************************/
@@ -5081,6 +5295,7 @@ void lru_gen_init_lruvec(struct lruvec *lruvec)
 	struct lru_gen_struct *lrugen = &lruvec->lrugen;
 	lrugen->max_seq = MIN_NR_GENS + 1;
 +	lrugen->enabled = lru_gen_enabled();
 	for_each_gen_type_zone(gen, type, zone)
 		INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
@@ -5120,6 +5335,9 @@ static int __init init_lru_gen(void)
 	BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
 	BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
 +	if (sysfs_create_group(mm_kobj, &lru_gen_attr_group))
 +		pr_err("lru_gen: failed to create sysfs group\n");
 +
 	return 0;
 };
 late_initcall(init_lru_gen);
 -- 
 2.17.1
--- a/target/linux/generic/backport-6.0/110-mm-multi-gen-LRU-thrashing-prevention.patch
+++ b/target/linux/generic/backport-6.0/110-mm-multi-gen-LRU-thrashing-prevention.patch
@ -0,0 +1,209 @@
 From 9d92c76fb8ac09ff195024139575d8c4db66b672 Mon Sep 17 00:00:00 2001
 From: Yu Zhao <yuzhao@google.com>
 Date: Thu, 27 Jan 2022 20:08:50 -0700
 Subject: [PATCH 11/14] mm: multi-gen LRU: thrashing prevention
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 Add /sys/kernel/mm/lru_gen/min_ttl_ms for thrashing prevention, as
 requested by many desktop users [1].
 When set to value N, it prevents the working set of N milliseconds
 from getting evicted. The OOM killer is triggered if this working set
 cannot be kept in memory. Based on the average human detectable lag
 (~100ms), N=1000 usually eliminates intolerable lags due to thrashing.
 Larger values like N=3000 make lags less noticeable at the risk of
 premature OOM kills.
 Compared with the size-based approach [2], this time-based approach
 has the following advantages:
 1. It is easier to configure because it is agnostic to applications
   and memory sizes.
 2. It is more reliable because it is directly wired to the OOM killer.
 [1] https://lore.kernel.org/r/Ydza%2FzXKY9ATRoh6@google.com/
 [2] https://lore.kernel.org/r/20101028191523.GA14972@google.com/
 Signed-off-by: Yu Zhao <yuzhao@google.com>
 Acked-by: Brian Geffon <bgeffon@google.com>
 Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
 Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
 Acked-by: Steven Barrett <steven@liquorix.net>
 Acked-by: Suleiman Souhlal <suleiman@google.com>
 Tested-by: Daniel Byrne <djbyrne@mtu.edu>
 Tested-by: Donald Carr <d@chaos-reins.com>
 Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
 Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
 Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
 Tested-by: Sofia Trinh <sofia.trinh@edi.works>
 Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
 Change-Id: I007499d7e47374b59fd620e8c3962940bc9f788e
 ---
 include/linux/mmzone.h |  2 ++
 mm/vmscan.c            | 74 ++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 73 insertions(+), 3 deletions(-)
 diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
 index 39bca2e420b7..0c502618b37b 100644
 --- a/include/linux/mmzone.h
 +++ b/include/linux/mmzone.h
@@ -419,6 +419,8 @@ struct lru_gen_struct {
 	unsigned long max_seq;
 	/* the eviction increments the oldest generation numbers */
 	unsigned long min_seq[ANON_AND_FILE];
 +	/* the birth time of each generation in jiffies */
 +	unsigned long timestamps[MAX_NR_GENS];
 	/* the multi-gen LRU lists, lazily sorted on eviction */
 	struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
 	/* the multi-gen LRU sizes, eventually consistent */
 diff --git a/mm/vmscan.c b/mm/vmscan.c
 index ea3d497019ab..0df253819edc 100644
 --- a/mm/vmscan.c
 +++ b/mm/vmscan.c
@@ -4293,6 +4293,7 @@ static void inc_max_seq(struct lruvec *lruvec, bool can_swap)
 	for (type = 0; type < ANON_AND_FILE; type++)
 		reset_ctrl_pos(lruvec, type, false);
 +	WRITE_ONCE(lrugen->timestamps[next], jiffies);
 	/* make sure preceding modifications appear */
 	smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);
@@ -4420,7 +4421,7 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsig
 	return false;
 }
 -static void age_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 +static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned long min_ttl)
 {
 	bool need_aging;
 	unsigned long nr_to_scan;
@@ -4434,16 +4435,36 @@ static void age_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 	mem_cgroup_calculate_protection(NULL, memcg);
 	if (mem_cgroup_below_min(memcg))
 -		return;
 +		return false;
 	need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan);
 +
 +	if (min_ttl) {
 +		int gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
 +		unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
 +
 +		if (time_is_after_jiffies(birth + min_ttl))
 +			return false;
 +
 +		/* the size is likely too small to be helpful */
 +		if (!nr_to_scan && sc->priority != DEF_PRIORITY)
 +			return false;
 +	}
 +
 	if (need_aging)
 		try_to_inc_max_seq(lruvec, max_seq, sc, swappiness);
 +
 +	return true;
 }
 +/* to protect the working set of the last N jiffies */
 +static unsigned long lru_gen_min_ttl __read_mostly;
 +
 static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
 {
 	struct mem_cgroup *memcg;
 +	bool success = false;
 +	unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl);
 	VM_WARN_ON_ONCE(!current_is_kswapd());
@@ -4466,12 +4487,32 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
 	do {
 		struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
 -		age_lruvec(lruvec, sc);
 +		if (age_lruvec(lruvec, sc, min_ttl))
 +			success = true;
 		cond_resched();
 	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
 	clear_mm_walk();
 +
 +	/* check the order to exclude compaction-induced reclaim */
 +	if (success || !min_ttl || sc->order)
 +		return;
 +
 +	/*
 +	 * The main goal is to OOM kill if every generation from all memcgs is
 +	 * younger than min_ttl. However, another possibility is all memcgs are
 +	 * either below min or empty.
 +	 */
 +	if (mutex_trylock(&oom_lock)) {
 +		struct oom_control oc = {
 +			.gfp_mask = sc->gfp_mask,
 +		};
 +
 +		out_of_memory(&oc);
 +
 +		mutex_unlock(&oom_lock);
 +	}
 }
 /*
@@ -5228,6 +5269,28 @@ static void lru_gen_change_state(bool enabled)
  *                          sysfs interface
  ******************************************************************************/
 +static ssize_t show_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
 +{
 +	return sprintf(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl)));
 +}
 +
 +static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr,
 +			     const char *buf, size_t len)
 +{
 +	unsigned int msecs;
 +
 +	if (kstrtouint(buf, 0, &msecs))
 +		return -EINVAL;
 +
 +	WRITE_ONCE(lru_gen_min_ttl, msecs_to_jiffies(msecs));
 +
 +	return len;
 +}
 +
 +static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR(
 +	min_ttl_ms, 0644, show_min_ttl, store_min_ttl
 +);
 +
 static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
 {
 	unsigned int caps = 0;
@@ -5276,6 +5339,7 @@ static struct kobj_attribute lru_gen_enabled_attr = __ATTR(
 );
 static struct attribute *lru_gen_attrs[] = {
 +	&lru_gen_min_ttl_attr.attr,
 	&lru_gen_enabled_attr.attr,
 	NULL
 };
@@ -5291,12 +5355,16 @@ static struct attribute_group lru_gen_attr_group = {
 void lru_gen_init_lruvec(struct lruvec *lruvec)
 {
 +	int i;
 	int gen, type, zone;
 	struct lru_gen_struct *lrugen = &lruvec->lrugen;
 	lrugen->max_seq = MIN_NR_GENS + 1;
 	lrugen->enabled = lru_gen_enabled();
 +	for (i = 0; i <= MIN_NR_GENS + 1; i++)
 +		lrugen->timestamps[i] = jiffies;
 +
 	for_each_gen_type_zone(gen, type, zone)
 		INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
 -- 
 2.17.1
--- a/target/linux/generic/backport-6.0/111-mm-multi-gen-LRU-debugfs-interface.patch
+++ b/target/linux/generic/backport-6.0/111-mm-multi-gen-LRU-debugfs-interface.patch
@ -0,0 +1,564 @@
 From d1e0e5fcdea16d4ceead496a0ea2fdbb6bc5bfe4 Mon Sep 17 00:00:00 2001
 From: Yu Zhao <yuzhao@google.com>
 Date: Thu, 27 Jan 2022 20:12:41 -0700
 Subject: [PATCH 12/14] mm: multi-gen LRU: debugfs interface
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 Add /sys/kernel/debug/lru_gen for working set estimation and proactive
 reclaim. These techniques are commonly used to optimize job scheduling
 (bin packing) in data centers [1][2].
 Compared with the page table-based approach and the PFN-based
 approach, this lruvec-based approach has the following advantages:
 1. It offers better choices because it is aware of memcgs, NUMA nodes,
   shared mappings and unmapped page cache.
 2. It is more scalable because it is O(nr_hot_pages), whereas the
   PFN-based approach is O(nr_total_pages).
 Add /sys/kernel/debug/lru_gen_full for debugging.
 [1] https://dl.acm.org/doi/10.1145/3297858.3304053
 [2] https://dl.acm.org/doi/10.1145/3503222.3507731
 Signed-off-by: Yu Zhao <yuzhao@google.com>
 Reviewed-by: Qi Zheng <zhengqi.arch@bytedance.com>
 Acked-by: Brian Geffon <bgeffon@google.com>
 Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
 Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
 Acked-by: Steven Barrett <steven@liquorix.net>
 Acked-by: Suleiman Souhlal <suleiman@google.com>
 Tested-by: Daniel Byrne <djbyrne@mtu.edu>
 Tested-by: Donald Carr <d@chaos-reins.com>
 Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
 Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
 Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
 Tested-by: Sofia Trinh <sofia.trinh@edi.works>
 Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
 Change-Id: I7bb06f14e0a94901a076cc3767d0855d4f1ea3ab
 ---
 include/linux/nodemask.h |   1 +
 mm/vmscan.c              | 411 ++++++++++++++++++++++++++++++++++++++-
 2 files changed, 402 insertions(+), 10 deletions(-)
 diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
 index 4b71a96190a8..3a0eec9f2faa 100644
 --- a/include/linux/nodemask.h
 +++ b/include/linux/nodemask.h
@@ -493,6 +493,7 @@ static inline int num_node_state(enum node_states state)
 #define first_online_node	0
 #define first_memory_node	0
 #define next_online_node(nid)	(MAX_NUMNODES)
 +#define next_memory_node(nid)	(MAX_NUMNODES)
 #define nr_node_ids		1U
 #define nr_online_nodes		1U
 diff --git a/mm/vmscan.c b/mm/vmscan.c
 index 0df253819edc..3e7aad06299b 100644
 --- a/mm/vmscan.c
 +++ b/mm/vmscan.c
@@ -52,6 +52,7 @@
 #include <linux/pagewalk.h>
 #include <linux/shmem_fs.h>
 #include <linux/ctype.h>
 +#include <linux/debugfs.h>
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -4197,12 +4198,40 @@ static void clear_mm_walk(void)
 		kfree(walk);
 }
 -static void inc_min_seq(struct lruvec *lruvec, int type)
 +static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
 {
 +	int zone;
 +	int remaining = MAX_LRU_BATCH;
 	struct lru_gen_struct *lrugen = &lruvec->lrugen;
 +	int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
 +
 +	if (type == LRU_GEN_ANON && !can_swap)
 +		goto done;
 +
 +	/* prevent cold/hot inversion if force_scan is true */
 +	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
 +		struct list_head *head = &lrugen->lists[old_gen][type][zone];
 +
 +		while (!list_empty(head)) {
 +			struct folio *folio = lru_to_folio(head);
 +
 +			VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
 +			VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
 +			VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
 +			VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
 +			new_gen = folio_inc_gen(lruvec, folio, false);
 +			list_move_tail(&folio->lru, &lrugen->lists[new_gen][type][zone]);
 +
 +			if (!--remaining)
 +				return false;
 +		}
 +	}
 +done:
 	reset_ctrl_pos(lruvec, type, true);
 	WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1);
 +
 +	return true;
 }
 static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
@@ -4248,7 +4277,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
 	return success;
 }
 -static void inc_max_seq(struct lruvec *lruvec, bool can_swap)
 +static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan)
 {
 	int prev, next;
 	int type, zone;
@@ -4262,9 +4291,13 @@ static void inc_max_seq(struct lruvec *lruvec, bool can_swap)
 		if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
 			continue;
 -		VM_WARN_ON_ONCE(type == LRU_GEN_FILE || can_swap);
 +		VM_WARN_ON_ONCE(!force_scan && (type == LRU_GEN_FILE || can_swap));
 -		inc_min_seq(lruvec, type);
 +		while (!inc_min_seq(lruvec, type, can_swap)) {
 +			spin_unlock_irq(&lruvec->lru_lock);
 +			cond_resched();
 +			spin_lock_irq(&lruvec->lru_lock);
 +		}
 	}
 	/*
@@ -4301,7 +4334,7 @@ static void inc_max_seq(struct lruvec *lruvec, bool can_swap)
 }
 static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
 -			       struct scan_control *sc, bool can_swap)
 +			       struct scan_control *sc, bool can_swap, bool force_scan)
 {
 	bool success;
 	struct lru_gen_mm_walk *walk;
@@ -4322,7 +4355,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
 	 * handful of PTEs. Spreading the work out over a period of time usually
 	 * is less efficient, but it avoids bursty page faults.
 	 */
 -	if (!(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) {
 +	if (!force_scan && !(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) {
 		success = iterate_mm_list_nowalk(lruvec, max_seq);
 		goto done;
 	}
@@ -4336,7 +4369,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
 	walk->lruvec = lruvec;
 	walk->max_seq = max_seq;
 	walk->can_swap = can_swap;
 -	walk->force_scan = false;
 +	walk->force_scan = force_scan;
 	do {
 		success = iterate_mm_list(lruvec, walk, &mm);
@@ -4356,7 +4389,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
 	VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq));
 -	inc_max_seq(lruvec, can_swap);
 +	inc_max_seq(lruvec, can_swap, force_scan);
 	/* either this sees any waiters or they will see updated max_seq */
 	if (wq_has_sleeper(&lruvec->mm_state.wait))
 		wake_up_all(&lruvec->mm_state.wait);
@@ -4452,7 +4485,7 @@ static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned
 	}
 	if (need_aging)
 -		try_to_inc_max_seq(lruvec, max_seq, sc, swappiness);
 +		try_to_inc_max_seq(lruvec, max_seq, sc, swappiness, false);
 	return true;
 }
@@ -5011,7 +5044,7 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *
 	if (current_is_kswapd())
 		return 0;
 -	if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap))
 +	if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false))
 		return nr_to_scan;
 done:
 	return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
@@ -5349,6 +5382,361 @@ static struct attribute_group lru_gen_attr_group = {
 	.attrs = lru_gen_attrs,
 };
 +/******************************************************************************
 + *                          debugfs interface
 + ******************************************************************************/
 +
 +static void *lru_gen_seq_start(struct seq_file *m, loff_t *pos)
 +{
 +	struct mem_cgroup *memcg;
 +	loff_t nr_to_skip = *pos;
 +
 +	m->private = kvmalloc(PATH_MAX, GFP_KERNEL);
 +	if (!m->private)
 +		return ERR_PTR(-ENOMEM);
 +
 +	memcg = mem_cgroup_iter(NULL, NULL, NULL);
 +	do {
 +		int nid;
 +
 +		for_each_node_state(nid, N_MEMORY) {
 +			if (!nr_to_skip--)
 +				return get_lruvec(memcg, nid);
 +		}
 +	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
 +
 +	return NULL;
 +}
 +
 +static void lru_gen_seq_stop(struct seq_file *m, void *v)
 +{
 +	if (!IS_ERR_OR_NULL(v))
 +		mem_cgroup_iter_break(NULL, lruvec_memcg(v));
 +
 +	kvfree(m->private);
 +	m->private = NULL;
 +}
 +
 +static void *lru_gen_seq_next(struct seq_file *m, void *v, loff_t *pos)
 +{
 +	int nid = lruvec_pgdat(v)->node_id;
 +	struct mem_cgroup *memcg = lruvec_memcg(v);
 +
 +	++*pos;
 +
 +	nid = next_memory_node(nid);
 +	if (nid == MAX_NUMNODES) {
 +		memcg = mem_cgroup_iter(NULL, memcg, NULL);
 +		if (!memcg)
 +			return NULL;
 +
 +		nid = first_memory_node;
 +	}
 +
 +	return get_lruvec(memcg, nid);
 +}
 +
 +static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
 +				  unsigned long max_seq, unsigned long *min_seq,
 +				  unsigned long seq)
 +{
 +	int i;
 +	int type, tier;
 +	int hist = lru_hist_from_seq(seq);
 +	struct lru_gen_struct *lrugen = &lruvec->lrugen;
 +
 +	for (tier = 0; tier < MAX_NR_TIERS; tier++) {
 +		seq_printf(m, "            %10d", tier);
 +		for (type = 0; type < ANON_AND_FILE; type++) {
 +			const char *s = "   ";
 +			unsigned long n[3] = {};
 +
 +			if (seq == max_seq) {
 +				s = "RT ";
 +				n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]);
 +				n[1] = READ_ONCE(lrugen->avg_total[type][tier]);
 +			} else if (seq == min_seq[type] || NR_HIST_GENS > 1) {
 +				s = "rep";
 +				n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]);
 +				n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]);
 +				if (tier)
 +					n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]);
 +			}
 +
 +			for (i = 0; i < 3; i++)
 +				seq_printf(m, " %10lu%c", n[i], s[i]);
 +		}
 +		seq_putc(m, '\n');
 +	}
 +
 +	seq_puts(m, "                      ");
 +	for (i = 0; i < NR_MM_STATS; i++) {
 +		const char *s = "      ";
 +		unsigned long n = 0;
 +
 +		if (seq == max_seq && NR_HIST_GENS == 1) {
 +			s = "LOYNFA";
 +			n = READ_ONCE(lruvec->mm_state.stats[hist][i]);
 +		} else if (seq != max_seq && NR_HIST_GENS > 1) {
 +			s = "loynfa";
 +			n = READ_ONCE(lruvec->mm_state.stats[hist][i]);
 +		}
 +
 +		seq_printf(m, " %10lu%c", n, s[i]);
 +	}
 +	seq_putc(m, '\n');
 +}
 +
 +static int lru_gen_seq_show(struct seq_file *m, void *v)
 +{
 +	unsigned long seq;
 +	bool full = !debugfs_real_fops(m->file)->write;
 +	struct lruvec *lruvec = v;
 +	struct lru_gen_struct *lrugen = &lruvec->lrugen;
 +	int nid = lruvec_pgdat(lruvec)->node_id;
 +	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
 +	DEFINE_MAX_SEQ(lruvec);
 +	DEFINE_MIN_SEQ(lruvec);
 +
 +	if (nid == first_memory_node) {
 +		const char *path = memcg ? m->private : "";
 +
 +#ifdef CONFIG_MEMCG
 +		if (memcg)
 +			cgroup_path(memcg->css.cgroup, m->private, PATH_MAX);
 +#endif
 +		seq_printf(m, "memcg %5hu %s\n", mem_cgroup_id(memcg), path);
 +	}
 +
 +	seq_printf(m, " node %5d\n", nid);
 +
 +	if (!full)
 +		seq = min_seq[LRU_GEN_ANON];
 +	else if (max_seq >= MAX_NR_GENS)
 +		seq = max_seq - MAX_NR_GENS + 1;
 +	else
 +		seq = 0;
 +
 +	for (; seq <= max_seq; seq++) {
 +		int type, zone;
 +		int gen = lru_gen_from_seq(seq);
 +		unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
 +
 +		seq_printf(m, " %10lu %10u", seq, jiffies_to_msecs(jiffies - birth));
 +
 +		for (type = 0; type < ANON_AND_FILE; type++) {
 +			unsigned long size = 0;
 +			char mark = full && seq < min_seq[type] ? 'x' : ' ';
 +
 +			for (zone = 0; zone < MAX_NR_ZONES; zone++)
 +				size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
 +
 +			seq_printf(m, " %10lu%c", size, mark);
 +		}
 +
 +		seq_putc(m, '\n');
 +
 +		if (full)
 +			lru_gen_seq_show_full(m, lruvec, max_seq, min_seq, seq);
 +	}
 +
 +	return 0;
 +}
 +
 +static const struct seq_operations lru_gen_seq_ops = {
 +	.start = lru_gen_seq_start,
 +	.stop = lru_gen_seq_stop,
 +	.next = lru_gen_seq_next,
 +	.show = lru_gen_seq_show,
 +};
 +
 +static int run_aging(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc,
 +		     bool can_swap, bool force_scan)
 +{
 +	DEFINE_MAX_SEQ(lruvec);
 +	DEFINE_MIN_SEQ(lruvec);
 +
 +	if (seq < max_seq)
 +		return 0;
 +
 +	if (seq > max_seq)
 +		return -EINVAL;
 +
 +	if (!force_scan && min_seq[!can_swap] + MAX_NR_GENS - 1 <= max_seq)
 +		return -ERANGE;
 +
 +	try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, force_scan);
 +
 +	return 0;
 +}
 +
 +static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc,
 +			int swappiness, unsigned long nr_to_reclaim)
 +{
 +	DEFINE_MAX_SEQ(lruvec);
 +
 +	if (seq + MIN_NR_GENS > max_seq)
 +		return -EINVAL;
 +
 +	sc->nr_reclaimed = 0;
 +
 +	while (!signal_pending(current)) {
 +		DEFINE_MIN_SEQ(lruvec);
 +
 +		if (seq < min_seq[!swappiness])
 +			return 0;
 +
 +		if (sc->nr_reclaimed >= nr_to_reclaim)
 +			return 0;
 +
 +		if (!evict_folios(lruvec, sc, swappiness, NULL))
 +			return 0;
 +
 +		cond_resched();
 +	}
 +
 +	return -EINTR;
 +}
 +
 +static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq,
 +		   struct scan_control *sc, int swappiness, unsigned long opt)
 +{
 +	struct lruvec *lruvec;
 +	int err = -EINVAL;
 +	struct mem_cgroup *memcg = NULL;
 +
 +	if (nid < 0 || nid >= MAX_NUMNODES || !node_state(nid, N_MEMORY))
 +		return -EINVAL;
 +
 +	if (!mem_cgroup_disabled()) {
 +		rcu_read_lock();
 +		memcg = mem_cgroup_from_id(memcg_id);
 +#ifdef CONFIG_MEMCG
 +		if (memcg && !css_tryget(&memcg->css))
 +			memcg = NULL;
 +#endif
 +		rcu_read_unlock();
 +
 +		if (!memcg)
 +			return -EINVAL;
 +	}
 +
 +	if (memcg_id != mem_cgroup_id(memcg))
 +		goto done;
 +
 +	lruvec = get_lruvec(memcg, nid);
 +
 +	if (swappiness < 0)
 +		swappiness = get_swappiness(lruvec, sc);
 +	else if (swappiness > 200)
 +		goto done;
 +
 +	switch (cmd) {
 +	case '+':
 +		err = run_aging(lruvec, seq, sc, swappiness, opt);
 +		break;
 +	case '-':
 +		err = run_eviction(lruvec, seq, sc, swappiness, opt);
 +		break;
 +	}
 +done:
 +	mem_cgroup_put(memcg);
 +
 +	return err;
 +}
 +
 +static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
 +				 size_t len, loff_t *pos)
 +{
 +	void *buf;
 +	char *cur, *next;
 +	unsigned int flags;
 +	struct blk_plug plug;
 +	int err = -EINVAL;
 +	struct scan_control sc = {
 +		.may_writepage = true,
 +		.may_unmap = true,
 +		.may_swap = true,
 +		.reclaim_idx = MAX_NR_ZONES - 1,
 +		.gfp_mask = GFP_KERNEL,
 +	};
 +
 +	buf = kvmalloc(len + 1, GFP_KERNEL);
 +	if (!buf)
 +		return -ENOMEM;
 +
 +	if (copy_from_user(buf, src, len)) {
 +		kvfree(buf);
 +		return -EFAULT;
 +	}
 +
 +	set_task_reclaim_state(current, &sc.reclaim_state);
 +	flags = memalloc_noreclaim_save();
 +	blk_start_plug(&plug);
 +	if (!set_mm_walk(NULL)) {
 +		err = -ENOMEM;
 +		goto done;
 +	}
 +
 +	next = buf;
 +	next[len] = '\0';
 +
 +	while ((cur = strsep(&next, ",;\n"))) {
 +		int n;
 +		int end;
 +		char cmd;
 +		unsigned int memcg_id;
 +		unsigned int nid;
 +		unsigned long seq;
 +		unsigned int swappiness = -1;
 +		unsigned long opt = -1;
 +
 +		cur = skip_spaces(cur);
 +		if (!*cur)
 +			continue;
 +
 +		n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid,
 +			   &seq, &end, &swappiness, &end, &opt, &end);
 +		if (n < 4 || cur[end]) {
 +			err = -EINVAL;
 +			break;
 +		}
 +
 +		err = run_cmd(cmd, memcg_id, nid, seq, &sc, swappiness, opt);
 +		if (err)
 +			break;
 +	}
 +done:
 +	clear_mm_walk();
 +	blk_finish_plug(&plug);
 +	memalloc_noreclaim_restore(flags);
 +	set_task_reclaim_state(current, NULL);
 +
 +	kvfree(buf);
 +
 +	return err ? : len;
 +}
 +
 +static int lru_gen_seq_open(struct inode *inode, struct file *file)
 +{
 +	return seq_open(file, &lru_gen_seq_ops);
 +}
 +
 +static const struct file_operations lru_gen_rw_fops = {
 +	.open = lru_gen_seq_open,
 +	.read = seq_read,
 +	.write = lru_gen_seq_write,
 +	.llseek = seq_lseek,
 +	.release = seq_release,
 +};
 +
 +static const struct file_operations lru_gen_ro_fops = {
 +	.open = lru_gen_seq_open,
 +	.read = seq_read,
 +	.llseek = seq_lseek,
 +	.release = seq_release,
 +};
 +
 /******************************************************************************
  *                          initialization
  ******************************************************************************/
@@ -5406,6 +5794,9 @@ static int __init init_lru_gen(void)
 	if (sysfs_create_group(mm_kobj, &lru_gen_attr_group))
 		pr_err("lru_gen: failed to create sysfs group\n");
 +	debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops);
 +	debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops);
 +
 	return 0;
 };
 late_initcall(init_lru_gen);
 -- 
 2.17.1
--- a/target/linux/generic/backport-6.0/112-mm-multi-gen-LRU-admin-guide.patch
+++ b/target/linux/generic/backport-6.0/112-mm-multi-gen-LRU-admin-guide.patch
@ -0,0 +1,265 @@
 From 22199c9b30ffcc332be643577709a2af960e6786 Mon Sep 17 00:00:00 2001
 From: Yu Zhao <yuzhao@google.com>
 Date: Sun, 23 Jan 2022 16:44:43 -0700
 Subject: [PATCH 13/14] mm: multi-gen LRU: admin guide
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 Add an admin guide.
 Signed-off-by: Yu Zhao <yuzhao@google.com>
 Acked-by: Brian Geffon <bgeffon@google.com>
 Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
 Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
 Acked-by: Steven Barrett <steven@liquorix.net>
 Acked-by: Suleiman Souhlal <suleiman@google.com>
 Tested-by: Daniel Byrne <djbyrne@mtu.edu>
 Tested-by: Donald Carr <d@chaos-reins.com>
 Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
 Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
 Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
 Tested-by: Sofia Trinh <sofia.trinh@edi.works>
 Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
 Change-Id: I1902178bcbb5adfa0a748c4d284a6456059bdd7e
 ---
 Documentation/admin-guide/mm/index.rst        |   1 +
 Documentation/admin-guide/mm/multigen_lru.rst | 162 ++++++++++++++++++
 mm/Kconfig                                    |   3 +-
 mm/vmscan.c                                   |   4 +
 4 files changed, 169 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/admin-guide/mm/multigen_lru.rst
 diff --git a/Documentation/admin-guide/mm/index.rst b/Documentation/admin-guide/mm/index.rst
 index 1bd11118dfb1..d1064e0ba34a 100644
 --- a/Documentation/admin-guide/mm/index.rst
 +++ b/Documentation/admin-guide/mm/index.rst
@@ -32,6 +32,7 @@ the Linux memory management.
    idle_page_tracking
    ksm
    memory-hotplug
 +   multigen_lru
    nommu-mmap
    numa_memory_policy
    numaperf
 diff --git a/Documentation/admin-guide/mm/multigen_lru.rst b/Documentation/admin-guide/mm/multigen_lru.rst
 new file mode 100644
 index 000000000000..33e068830497
 --- /dev/null
 +++ b/Documentation/admin-guide/mm/multigen_lru.rst
@@ -0,0 +1,162 @@
 +.. SPDX-License-Identifier: GPL-2.0
 +
 +=============
 +Multi-Gen LRU
 +=============
 +The multi-gen LRU is an alternative LRU implementation that optimizes
 +page reclaim and improves performance under memory pressure. Page
 +reclaim decides the kernel's caching policy and ability to overcommit
 +memory. It directly impacts the kswapd CPU usage and RAM efficiency.
 +
 +Quick start
 +===========
 +Build the kernel with the following configurations.
 +
 +* ``CONFIG_LRU_GEN=y``
 +* ``CONFIG_LRU_GEN_ENABLED=y``
 +
 +All set!
 +
 +Runtime options
 +===============
 +``/sys/kernel/mm/lru_gen/`` contains stable ABIs described in the
 +following subsections.
 +
 +Kill switch
 +-----------
 +``enabled`` accepts different values to enable or disable the
 +following components. Its default value depends on
 +``CONFIG_LRU_GEN_ENABLED``. All the components should be enabled
 +unless some of them have unforeseen side effects. Writing to
 +``enabled`` has no effect when a component is not supported by the
 +hardware, and valid values will be accepted even when the main switch
 +is off.
 +
 +====== ===============================================================
 +Values Components
 +====== ===============================================================
 +0x0001 The main switch for the multi-gen LRU.
 +0x0002 Clearing the accessed bit in leaf page table entries in large
 +       batches, when MMU sets it (e.g., on x86). This behavior can
 +       theoretically worsen lock contention (mmap_lock). If it is
 +       disabled, the multi-gen LRU will suffer a minor performance
 +       degradation for workloads that contiguously map hot pages,
 +       whose accessed bits can be otherwise cleared by fewer larger
 +       batches.
 +0x0004 Clearing the accessed bit in non-leaf page table entries as
 +       well, when MMU sets it (e.g., on x86). This behavior was not
 +       verified on x86 varieties other than Intel and AMD. If it is
 +       disabled, the multi-gen LRU will suffer a negligible
 +       performance degradation.
 +[yYnN] Apply to all the components above.
 +====== ===============================================================
 +
 +E.g.,
 +::
 +
 +    echo y >/sys/kernel/mm/lru_gen/enabled
 +    cat /sys/kernel/mm/lru_gen/enabled
 +    0x0007
 +    echo 5 >/sys/kernel/mm/lru_gen/enabled
 +    cat /sys/kernel/mm/lru_gen/enabled
 +    0x0005
 +
 +Thrashing prevention
 +--------------------
 +Personal computers are more sensitive to thrashing because it can
 +cause janks (lags when rendering UI) and negatively impact user
 +experience. The multi-gen LRU offers thrashing prevention to the
 +majority of laptop and desktop users who do not have ``oomd``.
 +
 +Users can write ``N`` to ``min_ttl_ms`` to prevent the working set of
 +``N`` milliseconds from getting evicted. The OOM killer is triggered
 +if this working set cannot be kept in memory. In other words, this
 +option works as an adjustable pressure relief valve, and when open, it
 +terminates applications that are hopefully not being used.
 +
 +Based on the average human detectable lag (~100ms), ``N=1000`` usually
 +eliminates intolerable janks due to thrashing. Larger values like
 +``N=3000`` make janks less noticeable at the risk of premature OOM
 +kills.
 +
 +The default value ``0`` means disabled.
 +
 +Experimental features
 +=====================
 +``/sys/kernel/debug/lru_gen`` accepts commands described in the
 +following subsections. Multiple command lines are supported, so is
 +concatenation with delimiters ``,`` and ``;``.
 +
 +``/sys/kernel/debug/lru_gen_full`` provides additional stats for
 +debugging. ``CONFIG_LRU_GEN_STATS=y`` keeps historical stats from
 +evicted generations in this file.
 +
 +Working set estimation
 +----------------------
 +Working set estimation measures how much memory an application needs
 +in a given time interval, and it is usually done with little impact on
 +the performance of the application. E.g., data centers want to
 +optimize job scheduling (bin packing) to improve memory utilization.
 +When a new job comes in, the job scheduler needs to find out whether
 +each server it manages can allocate a certain amount of memory for
 +this new job before it can pick a candidate. To do so, the job
 +scheduler needs to estimate the working sets of the existing jobs.
 +
 +When it is read, ``lru_gen`` returns a histogram of numbers of pages
 +accessed over different time intervals for each memcg and node.
 +``MAX_NR_GENS`` decides the number of bins for each histogram. The
 +histograms are noncumulative.
 +::
 +
 +    memcg  memcg_id  memcg_path
 +       node  node_id
 +           min_gen_nr  age_in_ms  nr_anon_pages  nr_file_pages
 +           ...
 +           max_gen_nr  age_in_ms  nr_anon_pages  nr_file_pages
 +
 +Each bin contains an estimated number of pages that have been accessed
 +within ``age_in_ms``. E.g., ``min_gen_nr`` contains the coldest pages
 +and ``max_gen_nr`` contains the hottest pages, since ``age_in_ms`` of
 +the former is the largest and that of the latter is the smallest.
 +
 +Users can write the following command to ``lru_gen`` to create a new
 +generation ``max_gen_nr+1``:
 +
 +    ``+ memcg_id node_id max_gen_nr [can_swap [force_scan]]``
 +
 +``can_swap`` defaults to the swap setting and, if it is set to ``1``,
 +it forces the scan of anon pages when swap is off, and vice versa.
 +``force_scan`` defaults to ``1`` and, if it is set to ``0``, it
 +employs heuristics to reduce the overhead, which is likely to reduce
 +the coverage as well.
 +
 +A typical use case is that a job scheduler runs this command at a
 +certain time interval to create new generations, and it ranks the
 +servers it manages based on the sizes of their cold pages defined by
 +this time interval.
 +
 +Proactive reclaim
 +-----------------
 +Proactive reclaim induces page reclaim when there is no memory
 +pressure. It usually targets cold pages only. E.g., when a new job
 +comes in, the job scheduler wants to proactively reclaim cold pages on
 +the server it selected, to improve the chance of successfully landing
 +this new job.
 +
 +Users can write the following command to ``lru_gen`` to evict
 +generations less than or equal to ``min_gen_nr``.
 +
 +    ``- memcg_id node_id min_gen_nr [swappiness [nr_to_reclaim]]``
 +
 +``min_gen_nr`` should be less than ``max_gen_nr-1``, since
 +``max_gen_nr`` and ``max_gen_nr-1`` are not fully aged (equivalent to
 +the active list) and therefore cannot be evicted. ``swappiness``
 +overrides the default value in ``/proc/sys/vm/swappiness``.
 +``nr_to_reclaim`` limits the number of pages to evict.
 +
 +A typical use case is that a job scheduler runs this command before it
 +tries to land a new job on a server. If it fails to materialize enough
 +cold pages because of the overestimation, it retries on the next
 +server according to the ranking result obtained from the working set
 +estimation step. This less forceful approach limits the impacts on the
 +existing jobs.
 diff --git a/mm/Kconfig b/mm/Kconfig
 index 6c86849c4db9..96cd3ae25c6f 100644
 --- a/mm/Kconfig
 +++ b/mm/Kconfig
@@ -1131,7 +1131,8 @@ config LRU_GEN
 	# make sure folio->flags has enough spare bits
 	depends on 64BIT || !SPARSEMEM || SPARSEMEM_VMEMMAP
 	help
 -	  A high performance LRU implementation to overcommit memory.
 +	  A high performance LRU implementation to overcommit memory. See
 +	  Documentation/admin-guide/mm/multigen_lru.rst for details.
 config LRU_GEN_ENABLED
 	bool "Enable by default"
 diff --git a/mm/vmscan.c b/mm/vmscan.c
 index 3e7aad06299b..146a54cf1bd9 100644
 --- a/mm/vmscan.c
 +++ b/mm/vmscan.c
@@ -5307,6 +5307,7 @@ static ssize_t show_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, c
 	return sprintf(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl)));
 }
 +/* see Documentation/admin-guide/mm/multigen_lru.rst for details */
 static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr,
 			     const char *buf, size_t len)
 {
@@ -5340,6 +5341,7 @@ static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, c
 	return snprintf(buf, PAGE_SIZE, "0x%04x\n", caps);
 }
 +/* see Documentation/admin-guide/mm/multigen_lru.rst for details */
 static ssize_t store_enabled(struct kobject *kobj, struct kobj_attribute *attr,
 			     const char *buf, size_t len)
 {
@@ -5487,6 +5489,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
 	seq_putc(m, '\n');
 }
 +/* see Documentation/admin-guide/mm/multigen_lru.rst for details */
 static int lru_gen_seq_show(struct seq_file *m, void *v)
 {
 	unsigned long seq;
@@ -5645,6 +5648,7 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq,
 	return err;
 }
 +/* see Documentation/admin-guide/mm/multigen_lru.rst for details */
 static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
 				 size_t len, loff_t *pos)
 {
 -- 
 2.17.1
--- a/target/linux/generic/backport-6.0/113-mm-multi-gen-LRU-design-doc.patch
+++ b/target/linux/generic/backport-6.0/113-mm-multi-gen-LRU-design-doc.patch
@ -0,0 +1,210 @@
 From bd82a74f6b5c0a75ef61be5e9be34319bb17328f Mon Sep 17 00:00:00 2001
 From: Yu Zhao <yuzhao@google.com>
 Date: Sun, 6 Mar 2022 20:35:00 -0700
 Subject: [PATCH 14/14] mm: multi-gen LRU: design doc
 MIME-Version: 1.0
 Content-Type: text/plain; charset=UTF-8
 Content-Transfer-Encoding: 8bit
 Add a design doc.
 Signed-off-by: Yu Zhao <yuzhao@google.com>
 Acked-by: Brian Geffon <bgeffon@google.com>
 Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
 Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
 Acked-by: Steven Barrett <steven@liquorix.net>
 Acked-by: Suleiman Souhlal <suleiman@google.com>
 Tested-by: Daniel Byrne <djbyrne@mtu.edu>
 Tested-by: Donald Carr <d@chaos-reins.com>
 Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
 Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
 Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
 Tested-by: Sofia Trinh <sofia.trinh@edi.works>
 Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
 Change-Id: I958afcabf5abc37b3e58f72638d35a349c31b98d
 ---
 Documentation/mm/index.rst        |   1 +
 Documentation/mm/multigen_lru.rst | 159 ++++++++++++++++++++++++++++++
 2 files changed, 160 insertions(+)
 create mode 100644 Documentation/mm/multigen_lru.rst
 diff --git a/Documentation/mm/index.rst b/Documentation/mm/index.rst
 index 575ccd40e30c..4aa12b8be278 100644
 --- a/Documentation/mm/index.rst
 +++ b/Documentation/mm/index.rst
@@ -51,6 +51,7 @@ above structured documentation, or deleted if it has served its purpose.
    ksm
    memory-model
    mmu_notifier
 +   multigen_lru
    numa
    overcommit-accounting
    page_migration
 diff --git a/Documentation/mm/multigen_lru.rst b/Documentation/mm/multigen_lru.rst
 new file mode 100644
 index 000000000000..d7062c6a8946
 --- /dev/null
 +++ b/Documentation/mm/multigen_lru.rst
@@ -0,0 +1,159 @@
 +.. SPDX-License-Identifier: GPL-2.0
 +
 +=============
 +Multi-Gen LRU
 +=============
 +The multi-gen LRU is an alternative LRU implementation that optimizes
 +page reclaim and improves performance under memory pressure. Page
 +reclaim decides the kernel's caching policy and ability to overcommit
 +memory. It directly impacts the kswapd CPU usage and RAM efficiency.
 +
 +Design overview
 +===============
 +Objectives
 +----------
 +The design objectives are:
 +
 +* Good representation of access recency
 +* Try to profit from spatial locality
 +* Fast paths to make obvious choices
 +* Simple self-correcting heuristics
 +
 +The representation of access recency is at the core of all LRU
 +implementations. In the multi-gen LRU, each generation represents a
 +group of pages with similar access recency. Generations establish a
 +(time-based) common frame of reference and therefore help make better
 +choices, e.g., between different memcgs on a computer or different
 +computers in a data center (for job scheduling).
 +
 +Exploiting spatial locality improves efficiency when gathering the
 +accessed bit. An rmap walk targets a single page and does not try to
 +profit from discovering a young PTE. A page table walk can sweep all
 +the young PTEs in an address space, but the address space can be too
 +sparse to make a profit. The key is to optimize both methods and use
 +them in combination.
 +
 +Fast paths reduce code complexity and runtime overhead. Unmapped pages
 +do not require TLB flushes; clean pages do not require writeback.
 +These facts are only helpful when other conditions, e.g., access
 +recency, are similar. With generations as a common frame of reference,
 +additional factors stand out. But obvious choices might not be good
 +choices; thus self-correction is necessary.
 +
 +The benefits of simple self-correcting heuristics are self-evident.
 +Again, with generations as a common frame of reference, this becomes
 +attainable. Specifically, pages in the same generation can be
 +categorized based on additional factors, and a feedback loop can
 +statistically compare the refault percentages across those categories
 +and infer which of them are better choices.
 +
 +Assumptions
 +-----------
 +The protection of hot pages and the selection of cold pages are based
 +on page access channels and patterns. There are two access channels:
 +
 +* Accesses through page tables
 +* Accesses through file descriptors
 +
 +The protection of the former channel is by design stronger because:
 +
 +1. The uncertainty in determining the access patterns of the former
 +   channel is higher due to the approximation of the accessed bit.
 +2. The cost of evicting the former channel is higher due to the TLB
 +   flushes required and the likelihood of encountering the dirty bit.
 +3. The penalty of underprotecting the former channel is higher because
 +   applications usually do not prepare themselves for major page
 +   faults like they do for blocked I/O. E.g., GUI applications
 +   commonly use dedicated I/O threads to avoid blocking rendering
 +   threads.
 +
 +There are also two access patterns:
 +
 +* Accesses exhibiting temporal locality
 +* Accesses not exhibiting temporal locality
 +
 +For the reasons listed above, the former channel is assumed to follow
 +the former pattern unless ``VM_SEQ_READ`` or ``VM_RAND_READ`` is
 +present, and the latter channel is assumed to follow the latter
 +pattern unless outlying refaults have been observed.
 +
 +Workflow overview
 +=================
 +Evictable pages are divided into multiple generations for each
 +``lruvec``. The youngest generation number is stored in
 +``lrugen->max_seq`` for both anon and file types as they are aged on
 +an equal footing. The oldest generation numbers are stored in
 +``lrugen->min_seq[]`` separately for anon and file types as clean file
 +pages can be evicted regardless of swap constraints. These three
 +variables are monotonically increasing.
 +
 +Generation numbers are truncated into ``order_base_2(MAX_NR_GENS+1)``
 +bits in order to fit into the gen counter in ``folio->flags``. Each
 +truncated generation number is an index to ``lrugen->lists[]``. The
 +sliding window technique is used to track at least ``MIN_NR_GENS`` and
 +at most ``MAX_NR_GENS`` generations. The gen counter stores a value
 +within ``[1, MAX_NR_GENS]`` while a page is on one of
 +``lrugen->lists[]``; otherwise it stores zero.
 +
 +Each generation is divided into multiple tiers. A page accessed ``N``
 +times through file descriptors is in tier ``order_base_2(N)``. Unlike
 +generations, tiers do not have dedicated ``lrugen->lists[]``. In
 +contrast to moving across generations, which requires the LRU lock,
 +moving across tiers only involves atomic operations on
 +``folio->flags`` and therefore has a negligible cost. A feedback loop
 +modeled after the PID controller monitors refaults over all the tiers
 +from anon and file types and decides which tiers from which types to
 +evict or protect.
 +
 +There are two conceptually independent procedures: the aging and the
 +eviction. They form a closed-loop system, i.e., the page reclaim.
 +
 +Aging
 +-----
 +The aging produces young generations. Given an ``lruvec``, it
 +increments ``max_seq`` when ``max_seq-min_seq+1`` approaches
 +``MIN_NR_GENS``. The aging promotes hot pages to the youngest
 +generation when it finds them accessed through page tables; the
 +demotion of cold pages happens consequently when it increments
 +``max_seq``. The aging uses page table walks and rmap walks to find
 +young PTEs. For the former, it iterates ``lruvec_memcg()->mm_list``
 +and calls ``walk_page_range()`` with each ``mm_struct`` on this list
 +to scan PTEs, and after each iteration, it increments ``max_seq``. For
 +the latter, when the eviction walks the rmap and finds a young PTE,
 +the aging scans the adjacent PTEs. For both, on finding a young PTE,
 +the aging clears the accessed bit and updates the gen counter of the
 +page mapped by this PTE to ``(max_seq%MAX_NR_GENS)+1``.
 +
 +Eviction
 +--------
 +The eviction consumes old generations. Given an ``lruvec``, it
 +increments ``min_seq`` when ``lrugen->lists[]`` indexed by
 +``min_seq%MAX_NR_GENS`` becomes empty. To select a type and a tier to
 +evict from, it first compares ``min_seq[]`` to select the older type.
 +If both types are equally old, it selects the one whose first tier has
 +a lower refault percentage. The first tier contains single-use
 +unmapped clean pages, which are the best bet. The eviction sorts a
 +page according to its gen counter if the aging has found this page
 +accessed through page tables and updated its gen counter. It also
 +moves a page to the next generation, i.e., ``min_seq+1``, if this page
 +was accessed multiple times through file descriptors and the feedback
 +loop has detected outlying refaults from the tier this page is in. To
 +this end, the feedback loop uses the first tier as the baseline, for
 +the reason stated earlier.
 +
 +Summary
 +-------
 +The multi-gen LRU can be disassembled into the following parts:
 +
 +* Generations
 +* Rmap walks
 +* Page table walks
 +* Bloom filters
 +* PID controller
 +
 +The aging and the eviction form a producer-consumer model;
 +specifically, the latter drives the former by the sliding window over
 +generations. Within the aging, rmap walks drive page table walks by
 +inserting hot densely populated page tables into the Bloom filters.
 +Within the eviction, the PID controller uses refaults as the feedback
 +to select types to evict and tiers to protect.
 -- 
 2.17.1