From fff4a4928ef2befc700c726a1b29fa792b5ae21c Mon Sep 17 00:00:00 2001 From: Fabian Keil Date: Thu, 3 Sep 2015 13:58:29 +0200 Subject: [PATCH 1/2] ZFS ARC: Move the ARC cache reaping out of the hot path ... and use a time-based rate limiting approach that can be controlled with the vfs.zfs.arc_reap_delay_min sysctl. Previously memory pressure could result in several thousand ARC reaping attempts per second. This wasn't merely inefficient but also caused lock contention and poor latency for ZFS operations that relied on arc_get_data_buf(). For details see: https://www.fabiankeil.de/gehacktes/electrobsd/zfs-arc-tuning/ Obtained from: ElectroBSD --- .../contrib/opensolaris/uts/common/fs/zfs/arc.c | 98 ++++++++++++++-------- 1 file changed, 64 insertions(+), 34 deletions(-) diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c index 2b8b4d7..df53256 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c @@ -3529,8 +3529,21 @@ extern kmem_cache_t *zio_buf_cache[]; extern kmem_cache_t *zio_data_buf_cache[]; extern kmem_cache_t *range_seg_cache; -static void __used -reap_arc_caches() +/* + * Used by arc_kmem_reap_now() and consider_reaping_arc_caches() + * to limit the time spent reaping. + * + * The arc_reaping_in_progress is a (somewhat racy) left over from a + * previous version of this code which could trigger multiple ARC cache + * reapings in parallel which should be avoided to reduce lock + * contention. It's hasn't been removed yet to encourage further + * experimenting. + */ +static unsigned int arc_reaping_in_progress = 0; +static sbintime_t last_reaping = 0; + +static void __noinline +reap_arc_caches(void) { size_t i; kmem_cache_t *prev_cache = NULL; @@ -3556,6 +3569,8 @@ static __noinline void arc_kmem_reap_now(void) { DTRACE_PROBE(arc__kmem_reap_start); + arc_reaping_in_progress++; + #ifdef _KERNEL if (arc_meta_used >= arc_meta_limit) { /* @@ -3583,9 +3598,50 @@ arc_kmem_reap_now(void) vmem_qcache_reap(zio_arena); } #endif +#ifdef _KERNEL + last_reaping = getsbinuptime(); +#endif + arc_reaping_in_progress = 0; DTRACE_PROBE(arc__kmem_reap_end); } + +/* + * Declared writable to allow resetting it. + * XXX: Should probably be a uint64 and integrated with kstat. + */ +static unsigned int arc_cache_reapings_skipped = 0; +SYSCTL_UINT(_vfs_zfs, OID_AUTO, arc_cache_reapings_skipped, CTLFLAG_RW, + &arc_cache_reapings_skipped, 0, "Number of times the ARC caches have not been reaped due to the reap delay"); + +static unsigned int min_arc_reap_delay = 200; +SYSCTL_UINT(_vfs_zfs, OID_AUTO, arc_reap_delay_min, CTLFLAG_RW, + &min_arc_reap_delay, 200, "Minimum delay between ARC cache reapings (milliseconds)"); + +static void __noinline +consider_reaping_arc_caches(void) +{ +#ifdef _KERNEL + sbintime_t now; + + if (arc_reaping_in_progress) + { + /* Already reaping in another thread. */ + arc_cache_reapings_skipped++; + return; + } + + now = getsbinuptime(); + if ((now - last_reaping) / SBT_1MS < min_arc_reap_delay) + { + /* Too soon to reap again. */ + arc_cache_reapings_skipped++; + return; + } +#endif + arc_kmem_reap_now(); +} + /* * Threads can block in arc_get_data_buf() waiting for this thread to evict * enough data and signal them to proceed. When this happens, the threads in @@ -3607,28 +3663,11 @@ arc_reclaim_thread(void *dummy __unused) { clock_t growtime = 0; callb_cpr_t cpr; - int autoreap = 0; CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG); mutex_enter(&arc_reclaim_lock); while (!arc_reclaim_thread_exit) { -#ifdef _KERNEL -/* KD 2015-02-10 - * Protect against UMA free memory bloat. We already do this on a low-memory - * basis in the allocator; it has to happen there rather than here due to - * response time considerations. Make the call here once every 10 passes as - * well; this reclaims unused UMA buffers every 10 seconds on an idle system - * and more frequently if the reclaim thread gets woken up by low RAM - * conditions. - */ - if ((zio_use_uma) && (autoreap++ == 10)) { - autoreap = 0; - DTRACE_PROBE(arc__reclaim_timed_reap); - reap_arc_caches(); - } -#endif /* _KERNEL */ - int64_t free_memory = arc_available_memory(); uint64_t evicted = 0; @@ -3645,8 +3684,6 @@ arc_reclaim_thread(void *dummy __unused) */ growtime = ddi_get_lbolt() + (arc_grow_retry * hz); - arc_kmem_reap_now(); - /* * If we are still low on memory, shrink the ARC * so that we have arc_shrink_min free space. @@ -3720,6 +3757,12 @@ arc_user_evicts_thread(void *dummy __unused) while (!arc_user_evicts_thread_exit) { mutex_exit(&arc_user_evicts_lock); + /* + * Consider reaping the ARC caches at least once per + * second, but more often when signalled under pressure. + */ + consider_reaping_arc_caches(); + arc_do_user_evicts(); /* @@ -3894,19 +3937,6 @@ arc_get_data_buf(arc_buf_t *buf) arc_space_consume(size, ARC_SPACE_META); } else { ASSERT(type == ARC_BUFC_DATA); -#ifdef _KERNEL -/* KD 2015-02-10 - * It would be nice if we could leave this to the arc_reclaim thread. - * Unfortunately we cannot; the test has to be done here as well, because - * under heavy I/O demand we can grab enough RAM fast enough to induce - * nasty oscillation problems. Fortunately we only need to call this when - * the system is under reasonably-severe memory stress. - */ - if (zio_use_uma && (ptob(vm_cnt.v_free_count) + size < ptob(vm_cnt.v_free_target))) { - DTRACE_PROBE3(arc__alloc_lowmem_reap, int, vm_cnt.v_free_count, int, size, int, vm_cnt.v_free_target); - reap_arc_caches(); - } -#endif /* _KERNEL */ buf->b_data = zio_data_buf_alloc(size); arc_space_consume(size, ARC_SPACE_DATA); } -- 2.5.1 From bc934ed405a6de2e924c7b59a5355383ab2e81a5 Mon Sep 17 00:00:00 2001 From: Fabian Keil Date: Fri, 4 Sep 2015 20:07:22 +0200 Subject: [PATCH 2/2] dtrace: Import fbt-time which llquantizes the time spent in a given kernel function Among other things this can be useful for general system tuning and analysis of lock contention (once you know where to expect it). Obtained from: ElectroBSD --- share/dtrace/fbt-time | 80 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100755 share/dtrace/fbt-time diff --git a/share/dtrace/fbt-time b/share/dtrace/fbt-time new file mode 100755 index 0000000..27a950b --- /dev/null +++ b/share/dtrace/fbt-time @@ -0,0 +1,80 @@ +#!/usr/sbin/dtrace -s + +/*************************************************************************** + * fbt-time + * + * Measures the time spent in a given kernel function and llquantizes + * the results for the traced calls. Example output can be seen at: + * http://www.fabiankeil.de/gehacktes/electrobsd/zfs-arc-tuning/ + * + * WARNING: Tracing kernel functions that get executed several thousand + * times per second may negatively impact system performance + * and can even cause brief lockups. Before using this script + * on production systems you may want to sample the traced + * kernel function first. + * + * XXX: This script should be fleshed out to be more resilient to + * incorrect input and provide more awesome stats. A better name + * would be nice, too. + * + * Copyright (c) 2014 Fabian Keil + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + ***************************************************************************/ + +#pragma D option quiet +#pragma D option dynvarsize=10m + +dtrace:::BEGIN +{ + traced_function = $$1; + + multiple = "microseconds"; + divisor = 1000; + executions = 0; + + this->timestamp = walltimestamp; + this->msecs = (this->timestamp / 1000000) % 1000; + printf("%Y.%.3d: Tracing '%s'. Hit CTRL-C to exit.\n", + this->timestamp, this->msecs, traced_function); +} + +fbt::$$1:entry +{ + self->timestamp = timestamp; + executions++; +} + +fbt::$$1:return +/self->timestamp/ +{ + this->elapsed = (timestamp - self->timestamp) / divisor; + @elapsed_time = llquantize(this->elapsed, 10, 0, 5, 10); + @elapsed_avg["Average"] = avg(this->elapsed); + self->elapsed = 0; +} + +tick-10sec, +END +/executions/ +{ + printf("%Y: Runtime for %s() in %s\n", walltimestamp, traced_function, multiple); + printa(@elapsed_avg); + printa(@elapsed_time); +} + +END +/executions == 0/ +{ + printf("%Y: Looks like %s() wasn't called yet.\n", walltimestamp, traced_function); +} -- 2.5.1