From 1a64e7f4993553eb6a691daf4b65e42db26e56dc Mon Sep 17 00:00:00 2001
From: Fabian Keil
Date: Tue, 22 Sep 2015 16:05:49 +0200
Subject: [PATCH 239/257] sys/vm: Limit the inactive pages more aggressively

Currently the ZFS ARC does not take the inactive pages into account
when calculating its target size.  If there is no limit for the
inactive pages, the ARC may shrink to its own minimum while the
number of inactive pages continues to grow:

last pid: 28429;  load averages: 0.48, 0.46, 0.41   up 0+03:39:07  17:24:59
91 processes: 2 running, 88 sleeping, 1 waiting
CPU:  1.4% user,  0.0% nice, 12.7% system,  0.2% interrupt, 85.7% idle
Mem: 396M Active, 489M Inact, 986M Wired, 292K Cache, 5202K Buf, 43M Free
ARC: 351M Total, 90M MFU, 44M MRU, 6839K Anon, 7810K Header, 203M Other, 350M Target
Swap: 2048M Total, 99M Used, 1949M Free, 4% Inuse

  PID USERNAME  THR PRI NICE   SIZE    RES STATE   C   TIME    WCPU COMMAND
   11 root        2 155 ki31     0K    32K RUN     0 377:37 170.34% idle
26625 fk         17  36    0   175M 24504K uwait   1   0:09   8.40% git
    0 root      468 -16    0     0K  7488K swapin  1   3:29   6.26% kernel
   22 root        1  20    -     0K    16K geli:w  1   4:16   5.06% g_eli[1] ada0s1d
[...]

2015 Sep 21 17:24:58: Scan goals in the previous minute: Update active LRU/deactivate pages 60
2015 Sep 21 17:24:58: Seconds since last 'Move inactive to cache or free' pass: 1477
2015 Sep 21 17:24:58: Seconds since last 'Launder dirty pages' pass: 9273

With this commit, the system lets the ARC indirectly put pressure on
the inactive pages until a given target is reached.  A couple of
sysctls can be used to set various limits; the auto-tuned defaults
should work reasonably well, though.  Note that suboptimal tuning can
result in excessive paging.

Screenshot (made with a previous version of this commit):
https://www.fabiankeil.de/bilder/electrobsd/kernel-compilation-with-inactive-page-limit-enabled.png

XXX: After rebasing on r300865 this commit caused a bunch of conflicts
that may not have been addressed ideally.
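
The new sysctls can be adjusted at runtime.  A hypothetical tuning
session (the values are made up, only the sysctl names come from
this commit) could look like:

  # sysctl vm.inactive_page_limit_enabled=2
  # sysctl vm.inactive_pages_to_free_max=2000
  # sysctl vm.inactive_page_limit_threshold=500

The checked__inactive__pages probe can be used to watch the limit
being enforced.  A minimal sketch, assuming the usual SDT translation
of "__" to "-" in probe names (arg0 is pages_to_free, arg1 is
pages_above_limit, arg2 is enforced_limit):

  # dtrace -n 'vm:::checked-inactive-pages /arg0 > 0/ {
        printf("freeing %d of %d pages above the limit of %d",
            arg0, arg1, arg2);
    }'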
Obtained from: ElectroBSD
---
 sys/vm/vm_pageout.c | 194 ++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 157 insertions(+), 37 deletions(-)

diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c
index 3e36f7c1c3fa..141c087becfa 100644
--- a/sys/vm/vm_pageout.c
+++ b/sys/vm/vm_pageout.c
@@ -140,15 +140,17 @@ SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start,
 
 SDT_PROVIDER_DEFINE(vm);
 SDT_PROBE_DEFINE(vm, , , vm__lowmem_scan);
-SDT_PROBE_DEFINE4(vm, , , before__inactive__scan, "struct vm_domain *vmd",
-    "int pass", "int page_shortage", "int deficit");
-SDT_PROBE_DEFINE5(vm, , , after__inactive__scan, "struct vm_domain *vmd",
+SDT_PROBE_DEFINE5(vm, , , before__inactive__scan, "struct vm_domain *vmd",
+    "int pass", "int page_shortage", "int deficit", "int inactive_page_surplus");
+SDT_PROBE_DEFINE6(vm, , , after__inactive__scan, "struct vm_domain *vmd",
     "int pass", "int page_shortage", "int addl_page_shortage",
-    "int vnodes_skipped");
+    "int vnodes_skipped", "int inactive_page_surplus");
 SDT_PROBE_DEFINE3(vm, , , before__active__scan, "struct vm_domain *vmd",
     "int pass", "int page_shortage");
 SDT_PROBE_DEFINE3(vm, , , after__active__scan, "struct vm_domain *vmd",
     "int pass", "int page_shortage");
+SDT_PROBE_DEFINE3(vm, , , checked__inactive__pages, "int pages_to_free",
+    "int pages_above_limit", "int enforced_limit");
 
 #if !defined(NO_SWAPPING)
 /* the kernel process "vm_daemon"*/
@@ -230,6 +232,36 @@ SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts,
 SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
 	CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");
 
+static int inactive_page_limit_enabled = 1;
+SYSCTL_INT(_vm, OID_AUTO, inactive_page_limit_enabled, CTLFLAG_RW,
+    &inactive_page_limit_enabled, 0,
+    "Free inactive pages above the target more aggressively. "
+    "Values: 0 (disabled), 1 (soft mode, only apply limit if free "
+    "page count is low), 2 (ignore free count)");
+
+static int inactive_page_limit_offset_i;
+SYSCTL_INT(_vm, OID_AUTO, inactive_page_limit_offset_i, CTLFLAG_RW,
+    &inactive_page_limit_offset_i, 0,
+    "Number of inactive pages relative to the inactive target "
+    "required for inactive pages to be freed.");
+
+static int inactive_page_limit_offset_f;
+SYSCTL_INT(_vm, OID_AUTO, inactive_page_limit_offset_f, CTLFLAG_RW,
+    &inactive_page_limit_offset_f, 0,
+    "Number of free pages relative to the free target required for "
+    "the inactive memory limit to be applied.");
+
+static int inactive_pages_to_free_max = 1000;
+SYSCTL_INT(_vm, OID_AUTO, inactive_pages_to_free_max, CTLFLAG_RW,
+    &inactive_pages_to_free_max, 0,
+    "Maximum number of inactive pages above the target to free at once.");
+
+static int inactive_page_limit_threshold = 1000;
+SYSCTL_INT(_vm, OID_AUTO, inactive_page_limit_threshold, CTLFLAG_RW,
+    &inactive_page_limit_threshold, 0,
+    "Number of inactive pages above the limit required "
+    "to trigger an inactive page reduction.");
+
 static int pageout_lock_miss;
 SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss, CTLFLAG_RD,
 	&pageout_lock_miss, 0, "vget() lock misses during pageout");
@@ -848,6 +880,41 @@ unlock_mp:
 	return (error);
 }
 
+static int
+vm_pageout_get_inactive_page_surplus(void)
+{
+	int pages_to_free;
+	int pages_above_limit;
+	int enforced_limit;
+
+	/* Return early so the DTrace probe does not fire. */
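+	/* (Neither the limit nor the surplus is computed in that case.) */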
+	if (!inactive_page_limit_enabled)
+		return (0);
+
+	enforced_limit = vm_cnt.v_inactive_target + inactive_page_limit_offset_i;
+	pages_above_limit = vm_cnt.v_inactive_count - enforced_limit;
+
+	/*
+	 * We want to free inactive pages if the threshold of inactive
+	 * pages above the limit is reached and we are either using
+	 * a hard limit, or the number of free pages is below the
+	 * free page limit.
+	 */
+	if ((pages_above_limit >= inactive_page_limit_threshold) &&
+	    ((inactive_page_limit_enabled == 2) ||
+	    (vm_paging_target() + inactive_page_limit_offset_f > 0))) {
+		pages_to_free = imin(inactive_pages_to_free_max,
+		    pages_above_limit);
+	} else {
+		pages_to_free = 0;
+	}
+
+	SDT_PROBE3(vm, , , checked__inactive__pages, pages_to_free,
+	    pages_above_limit, enforced_limit);
+
+	return (pages_to_free);
+}
+
 #define VMD_PASS_MAX 3
 /*
  * vm_pageout_scan does the dirty work for the pageout daemon.
@@ -870,15 +937,35 @@ vm_pageout_scan(struct vm_domain *vmd, int pass)
 	int maxlaunder, maxscan, page_shortage, scan_tick, scanned;
 	int starting_page_shortage, vnodes_skipped;
 	boolean_t pageout_ok, queue_locked;
+	int inactive_page_surplus;
 
 	KASSERT(pass <= VMD_PASS_MAX,
 	    ("vm_pageout_scan: Invalid pass code %d", pass));
 
 	/*
+	 * The addl_page_shortage is the number of temporarily
+	 * stuck pages in the inactive queue.  In other words, the
+	 * number of pages from the inactive count that should be
+	 * discounted in setting the target for the active queue scan.
+	 */
+	addl_page_shortage = 0;
+
+	/*
+	 * Calculate the number of pages that we want to free.
+	 */
+	if (pass > 0) {
+		deficit = atomic_readandclear_int(&vm_pageout_deficit);
+		page_shortage = vm_paging_target() + deficit;
+		inactive_page_surplus = vm_pageout_get_inactive_page_surplus();
+	} else
+		page_shortage = deficit = inactive_page_surplus = 0;
+	starting_page_shortage = page_shortage;
+
+	/*
 	 * If we need to reclaim memory ask kernel caches to return
 	 * some.  We rate limit to avoid thrashing.
 	 */
-	if (vmd == &vm_dom[0] && pass > 0 &&
+	if (vmd == &vm_dom[0] && pass > 0 && page_shortage > 0 &&
 	    (time_uptime - lowmem_uptime) >= lowmem_period) {
 		/*
 		 * Decrease registered cache sizes.
@@ -894,26 +981,6 @@ vm_pageout_scan(struct vm_domain *vmd, int pass)
 	}
 
 	/*
-	 * The addl_page_shortage is the number of temporarily
-	 * stuck pages in the inactive queue.  In other words, the
-	 * number of pages from the inactive count that should be
-	 * discounted in setting the target for the active queue scan.
-	 */
-	addl_page_shortage = 0;
-
-	/*
-	 * Calculate the number of pages that we want to free.  This number
-	 * can be negative if many pages are freed between the wakeup call to
-	 * the page daemon and this calculation.
-	 */
-	if (pass > 0) {
-		deficit = atomic_readandclear_int(&vm_pageout_deficit);
-		page_shortage = vm_paging_target() + deficit;
-	} else
-		page_shortage = deficit = 0;
-	starting_page_shortage = page_shortage;
-
-	/*
 	 * maxlaunder limits the number of dirty pages we flush per scan.
 	 * For most systems a smaller value (16 or 32) is more robust under
 	 * extreme memory and disk pressure because any unnecessary writes
@@ -928,10 +995,18 @@ vm_pageout_scan(struct vm_domain *vmd, int pass)
 	if (pass > 1)
 		maxlaunder = 10000;
 
+	/*
+	 * Prevent laundering if there's no page shortage and we are
+	 * merely trying to free inactive pages.  Otherwise we may end
+	 * up swapping before it's really necessary.
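+	 *
+	 * With maxlaunder set to 0 the scan below can only reclaim
+	 * clean pages; dirty pages are left on the inactive queue.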
+	 */
+	if (page_shortage <= 0)
+		maxlaunder = 0;
+
 	vnodes_skipped = 0;
 
-	SDT_PROBE4(vm, , , before__inactive__scan, vmd, pass, page_shortage,
-	    deficit);
+	SDT_PROBE5(vm, , , before__inactive__scan, vmd, pass, page_shortage,
+	    deficit, inactive_page_surplus);
 
 	/*
 	 * Start scanning the inactive queue for pages that we can free.  The
@@ -944,7 +1019,8 @@ vm_pageout_scan(struct vm_domain *vmd, int pass)
 	vm_pagequeue_lock(pq);
 	queue_locked = TRUE;
 	for (m = TAILQ_FIRST(&pq->pq_pl);
-	    m != NULL && maxscan-- > 0 && page_shortage > 0;
+	    m != NULL && maxscan-- > 0 &&
+	    (page_shortage > 0 || inactive_page_surplus > 0);
 	    m = next) {
 		vm_pagequeue_assert_locked(pq);
 		KASSERT(queue_locked, ("unlocked inactive queue"));
@@ -1080,6 +1156,7 @@ free_page:
 			vm_page_free(m);
 			PCPU_INC(cnt.v_dfree);
 			--page_shortage;
+			--inactive_page_surplus;
 		} else if ((object->flags & OBJ_DEAD) != 0) {
 			/*
 			 * Leave dirty pages from dead objects at the front of
@@ -1107,13 +1184,19 @@ requeue_page:
 			vm_pagequeue_lock(pq);
 			queue_locked = TRUE;
 			vm_page_requeue_locked(m);
-		} else if (maxlaunder > 0) {
+		} else if (maxlaunder > 0 && page_shortage > 0) {
 			/*
-			 * We always want to try to flush some dirty pages if
-			 * we encounter them, to keep the system stable.
+			 * As long as there is a page shortage, we try to
+			 * flush some dirty pages if we encounter them, to
+			 * keep the system stable.
 			 * Normally this number is small, but under extreme
 			 * pressure where there are insufficient clean pages
 			 * on the inactive queue, we may have to go all out.
+			 *
+			 * XXX: We probably should not bother laundering
+			 * until there is a chance that we cannot free
+			 * enough clean pages to take care of the page
+			 * shortage.
 			 */
 
 			if (object->type != OBJT_SWAP &&
@@ -1158,15 +1241,27 @@ relock_queue:
 	}
 	vm_pagequeue_unlock(pq);
 
-	SDT_PROBE5(vm, , , after__inactive__scan, vmd, pass, page_shortage,
-	    addl_page_shortage, vnodes_skipped);
+	/*
+	 * If the page shortage has been taken care of, or if we were
+	 * just trying to free surplus inactive pages, the number of
+	 * temporarily stuck pages is more or less meaningless.  Reset
+	 * the counter to prevent pointless swapping.
+	 */
+	if (page_shortage <= 0)
+		addl_page_shortage = 0;
+
+	SDT_PROBE6(vm, , , after__inactive__scan, vmd, pass, page_shortage,
+	    addl_page_shortage, vnodes_skipped, inactive_page_surplus);
 
 #if !defined(NO_SWAPPING)
 	/*
-	 * Wakeup the swapout daemon if we didn't free the targeted number of
-	 * pages.
+	 * Wake up the swapout daemon if we didn't free the targeted number
+	 * of pages and we are either desperate or there are no inactive
+	 * pages left to free (in which case we will be desperate soon
+	 * enough).
 	 */
-	if (vm_swap_enabled && page_shortage > 0)
+	if (vm_swap_enabled && page_shortage > 0 &&
+	    (pass > 1 || !vm_pageout_get_inactive_page_surplus()))
 		vm_req_vmdaemon(VM_SWAP_NORMAL);
 #endif
@@ -1587,7 +1682,9 @@ vm_pageout_worker(void *arg)
 		/*
 		 * Might the page daemon receive a wakeup call?
 		 */
-		if (vm_pageout_wanted) {
+		/* XXX: After r300865 this may no longer work.  Investigate! */
+		if (vm_pageout_wanted ||
+		    vm_pageout_get_inactive_page_surplus() > 0) {
 			/*
 			 * No.  Either vm_pageout_wanted was set by another
 			 * thread during the previous scan, which must have
@@ -1656,6 +1753,29 @@ vm_pageout_init(void)
 	vm_cnt.v_inactive_target = vm_cnt.v_free_count / 3;
 
 	/*
+	 * Apply the inactive memory limit before ZFS's
+	 * dirty data limit kicks in.
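+	 *
+	 * (ZFS's dirty data limit defaults to roughly a tenth of
+	 * physical memory, which is what v_page_count / 9 tracks.)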
+	 */
+	inactive_page_limit_offset_f = vm_cnt.v_page_count / 9
+	    - vm_cnt.v_free_target;
+
+	/*
+	 * A man carrying a large stone through the savanna was asked why
+	 * he would do such a strange thing.  His explanation: "If a lion
+	 * comes, I'll throw away the stone, which will allow me to run
+	 * faster."
+	 *
+	 * For similar reasons the inactive page limit defaults to
+	 * allowing twice the number of inactive pages the vm targets
+	 * itself.
+	 *
+	 * If sudden memory pressure comes, the inactive page reserve can
+	 * be thrown away to make it less likely that the system has to
+	 * start paging.
+	 */
+	inactive_page_limit_offset_i = vm_cnt.v_inactive_target;
+
+	/*
 	 * Set the default wakeup threshold to be 10% above the minimum
 	 * page limit.  This keeps the steady state out of shortfall.
 	 */
-- 
2.11.0