From fad1f6f24a6ff2e641d6089a489d53dad6a3c8ea Mon Sep 17 00:00:00 2001 From: Fabian Keil Date: Thu, 12 May 2016 14:52:16 +0200 Subject: [PATCH] GELI: Use a dedicated uma zone for writes to onetime devices ... as they are likely to originate from the vm page daemon. Previously the system could deadlock because the vm daemon was waiting for pages to be written to disk, while GELI was waiting for the vm daemon to make room for the buffer GELI needed to actually write the pages: (kgdb) where #0 sched_switch (td=0xfffff800055bf9a0, newtd=0xfffff80002341000, flags=) at /usr/src/sys/kern/sched_ule.c:1969 #1 0xffffffff80962635 in mi_switch (flags=, newtd=0x0) at /usr/src/sys/kern/kern_synch.c:455 #2 0xffffffff809aaa3a in sleepq_wait (wchan=0x0, pri=0) at /usr/src/sys/kern/subr_sleepqueue.c:637 #3 0xffffffff80962038 in _sleep (ident=, lock=, priority=, wmesg=0xffffffff80e826ee "vmwait", sbt=0, pr=, flags=) at /usr/src/sys/kern/kern_synch.c:229 #4 0xffffffff80c1ac6b in vm_wait () at /usr/src/sys/vm/vm_page.c:2705 #5 0xffffffff80c06a9f in kmem_back (object=0xffffffff8144d6f0, addr=18446741874805047296, size=69632, flags=) at /usr/src/sys/vm/vm_kern.c:356 #6 0xffffffff80c068d2 in kmem_malloc (vmem=0xffffffff813aa500, size=69632, flags=2) at /usr/src/sys/vm/vm_kern.c:316 #7 0xffffffff80bfd7d6 in uma_large_malloc (size=69632, wait=2) at /usr/src/sys/vm/uma_core.c:1106 #8 0xffffffff8092f614 in malloc (size=, mtp=0xffffffff81b4d520, flags=0) at /usr/src/sys/kern/kern_malloc.c:513 #9 0xffffffff81b4ab99 in g_eli_crypto_run (wr=0xfffff80002560040, bp=0xfffff80008a86d90) at /usr/src/sys/modules/geom/geom_eli/../../../geom/eli/g_eli_privacy.c:262 #10 0xffffffff81b3e860 in g_eli_worker (arg=0xfffff80002560040) at /usr/src/sys/modules/geom/geom_eli/../../../geom/eli/g_eli.c:565 #11 0xffffffff80910f5c in fork_exit (callout=0xffffffff81b3e0b0 , arg=0xfffff80002560040, frame=0xfffffe005005ec00) at /usr/src/sys/kern/kern_fork.c:1034 #12 0xffffffff80c33f0e in fork_trampoline () at 
/usr/src/sys/amd64/amd64/exception.S:611 #13 0x0000000000000000 in ?? () (kgdb) p vm_cnt $16 = {v_swtch = 0, v_trap = 0, v_syscall = 0, v_intr = 0, v_soft = 0, v_vm_faults = 0, v_io_faults = 0, v_cow_faults = 0, v_cow_optim = 0, v_zfod = 0, v_ozfod = 0, v_swapin = 0, v_swapout = 0, v_swappgsin = 0, v_swappgsout = 0, v_vnodein = 0, v_vnodeout = 0, v_vnodepgsin = 0, v_vnodepgsout = 0, v_intrans = 0, v_reactivated = 0, v_pdwakeups = 22197, v_pdpages = 0, v_tcached = 0, v_dfree = 0, v_pfree = 0, v_tfree = 0, v_page_size = 4096, v_page_count = 247688, v_free_reserved = 372, v_free_target = 5320, v_free_min = 1609, v_free_count = 2, v_wire_count = 140735, v_active_count = 96194, v_inactive_target = 7980, v_inactive_count = 10756, v_cache_count = 0, v_pageout_free_min = 34, v_interrupt_free_min = 2, v_free_severe = 990, v_forks = 0, v_vforks = 0, v_rforks = 0, v_kthreads = 0, v_forkpages = 0, v_vforkpages = 0, v_rforkpages = 0, v_kthreadpages = 0, v_spare = 0xffffffff8144d5ac} A sysctl is added to optionally use the zone for GELI writes in general, without letting common writes cut into the reserve for onetime writes. This may reduce latency for larger writes and as we need to keep a couple of items in the zone anyway, the impact on the zone size is minor. Initial testing seems to indicate that the sysctl could be safely enabled by default in the future. Currently a single zone with a somewhat humongous item size sufficient for all GELI writes is being used. While this may look a bit wasteful, in practice we don't need a lot of items, so this seems tolerable for now. The best solution would probably be to only use the dedicated uma zone for common writes if the size is above 65536 bytes, the largest zone item size internally used by malloc. Currently the zone isn't used for reads as those are less time critical and usually are small enough for malloc() to succeed right away anyway. 
Example length distribution when reproducing ElectroBSD with -j4 and 1 GB of RAM: gpt/swap-ada1.eli BIO_WRITE value ------------- Distribution ------------- count < 4096 | 0 4096 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 4965848 8192 |@@@@@ 943980 12288 |@@ 362668 16384 |@ 161485 20480 |@ 120939 24576 | 87827 28672 | 57402 32768 | 40470 36864 | 42243 40960 | 28543 45056 | 20347 49152 | 15235 53248 | 13450 57344 | 9535 61440 | 9952 65536 |@ 179360 69632 | 0 gpt/swap-ada1.eli BIO_READ value ------------- Distribution ------------- count < 4096 | 0 4096 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 4645114 8192 | 0 12288 | 0 16384 | 3446 20480 | 0 Note that the GELI overhead is not accounted for here and only the results for the swap device are shown. Zone use: [fk@elektrobier3 ~]$ vmstat -z | egrep 'ITEM|eli' | column -t ITEM SIZE LIMIT USED FREE REQ FAIL SLEEP g_eli: 172032, 0, 0, 14, 8077487, 0, 0 This includes writes to gpt/swap-ada1.eli and gpt/dpool-ada1.eli. Discussion: While the zone served 8077487 memory requests total, 14 items were sufficient for this and therefore the zone only withheld 172032 * 14 bytes plus zone meta data from the rest of the system. 
Obtained from: ElectroBSD --- sys/geom/eli/g_eli.c | 26 ++++++++++++++++++++++++-- sys/geom/eli/g_eli.h | 18 ++++++++++++++++++ sys/geom/eli/g_eli_privacy.c | 35 +++++++++++++++++++++++++++++++++-- 3 files changed, 75 insertions(+), 4 deletions(-) diff --git a/sys/geom/eli/g_eli.c b/sys/geom/eli/g_eli.c index 9add130..4c6309c 100644 --- a/sys/geom/eli/g_eli.c +++ b/sys/geom/eli/g_eli.c @@ -82,6 +82,17 @@ u_int g_eli_batch = 0; SYSCTL_UINT(_kern_geom_eli, OID_AUTO, batch, CTLFLAG_RWTUN, &g_eli_batch, 0, "Use crypto operations batching"); +uma_zone_t g_eli_zone; +static u_int g_eli_uma_reserve = 1; +SYSCTL_UINT(_kern_geom_eli, OID_AUTO, uma_reserve, CTLFLAG_RDTUN, + &g_eli_uma_reserve, 0, "Items to pre-allocate in dedicated uma zone " + "and reserve for writes to onetime disks"); + +u_int g_eli_all_writes_use_uma = 0; +SYSCTL_UINT(_kern_geom_eli, OID_AUTO, use_uma_for_all_writes, CTLFLAG_RDTUN, + &g_eli_all_writes_use_uma, 0, "Use the dedicated uma zone for all writes. " + "May reduce write latency but also inflates memory use a bit"); + /* * Passphrase cached during boot, in order to be more user-friendly if * there are multiple providers using the same passphrase. @@ -246,7 +257,12 @@ g_eli_write_done(struct bio *bp) pbp->bio_inbed++; if (pbp->bio_inbed < pbp->bio_children) return; - free(pbp->bio_driver2, M_ELI); + sc = pbp->bio_to->geom->softc; + if (g_eli_all_writes_use_uma || + (sc->sc_flags & G_ELI_FLAG_ONETIME) != 0) + uma_zfree(g_eli_zone, pbp->bio_driver2); + else + free(pbp->bio_driver2, M_ELI); pbp->bio_driver2 = NULL; if (pbp->bio_error != 0) { G_ELI_LOGREQ(0, pbp, "%s() failed (error=%d)", __func__, @@ -258,7 +274,6 @@ g_eli_write_done(struct bio *bp) /* * Write is finished, send it up. 
*/ - sc = pbp->bio_to->geom->softc; g_io_deliver(pbp, pbp->bio_error); if (sc != NULL) atomic_subtract_int(&sc->sc_inflight, 1); @@ -1254,6 +1269,12 @@ static void g_eli_init(struct g_class *mp) { + g_eli_zone = uma_zcreate("g_eli", ELI_ZONE_ITEM_SIZE, NULL, NULL, + NULL, NULL, 0, UMA_ZONE_NOFREE); + /* Increase the chances that items are available when needed. */ + uma_prealloc(g_eli_zone, g_eli_uma_reserve); + uma_zone_reserve(g_eli_zone, g_eli_uma_reserve); + g_eli_pre_sync = EVENTHANDLER_REGISTER(shutdown_pre_sync, g_eli_shutdown_pre_sync, mp, SHUTDOWN_PRI_FIRST); if (g_eli_pre_sync == NULL) @@ -1264,6 +1285,7 @@ static void g_eli_fini(struct g_class *mp) { + uma_zdestroy(g_eli_zone); if (g_eli_pre_sync != NULL) EVENTHANDLER_DEREGISTER(shutdown_pre_sync, g_eli_pre_sync); } diff --git a/sys/geom/eli/g_eli.h b/sys/geom/eli/g_eli.h index 680f673..4119348 100644 --- a/sys/geom/eli/g_eli.h +++ b/sys/geom/eli/g_eli.h @@ -139,6 +139,24 @@ #define G_ELI_CRYPTO_SW 2 #ifdef _KERNEL +/* + * Items in the dedicated uma zone have a fixed size and need + * to be big enough for all write lengths. + * + * MAXPHYS is the largest amount of data geli can receive in a row, + * additionally we have to account for the encryption overhead, which + * depends on the number of sectors. + * + * 512 bytes is the smallest sector size supported and results in the + * largest overhead. If larger sectors are being used, we'll just waste + * a bit more memory. + * + * Given that the zone does not need a lot of items, the generous + * item size seems tolerable for now. 
+ */ +#define ELI_ZONE_ITEM_SIZE (MAXPHYS + (MAXPHYS / 512) * \ + (sizeof(struct cryptop) + sizeof(struct cryptodesc))) + extern int g_eli_debug; extern u_int g_eli_overwrites; extern u_int g_eli_batch; diff --git a/sys/geom/eli/g_eli_privacy.c b/sys/geom/eli/g_eli_privacy.c index 6ed5846..282b886 100644 --- a/sys/geom/eli/g_eli_privacy.c +++ b/sys/geom/eli/g_eli_privacy.c @@ -49,6 +49,9 @@ __FBSDID("$FreeBSD$"); #include #include +extern u_int g_eli_all_writes_use_uma; +extern uma_zone_t g_eli_zone; + /* * Code paths: * BIO_READ: @@ -153,7 +156,11 @@ g_eli_crypto_write_done(struct cryptop *crp) if (bp->bio_error != 0) { G_ELI_LOGREQ(0, bp, "Crypto WRITE request failed (error=%d).", bp->bio_error); - free(bp->bio_driver2, M_ELI); + if (g_eli_all_writes_use_uma || + (sc->sc_flags & G_ELI_FLAG_ONETIME) != 0) + uma_zfree(g_eli_zone, bp->bio_driver2); + else + free(bp->bio_driver2, M_ELI); bp->bio_driver2 = NULL; g_destroy_bio(cbp); g_io_deliver(bp, bp->bio_error); @@ -259,8 +266,32 @@ g_eli_crypto_run(struct g_eli_worker *wr, struct bio *bp) */ if (bp->bio_cmd == BIO_WRITE) size += bp->bio_length; - p = malloc(size, M_ELI, M_WAITOK); + if (bp->bio_cmd == BIO_WRITE && + (((sc->sc_flags & G_ELI_FLAG_ONETIME) != 0) || + g_eli_all_writes_use_uma)) { + int uma_flags; + + KASSERT(size <= ELI_ZONE_ITEM_SIZE, + ("Insufficient ELI_ZONE_ITEM_SIZE %u < %u", + (unsigned)ELI_ZONE_ITEM_SIZE, (unsigned)size)); + /* + * Writes to onetime providers are likely to originate + * from the page daemon, therefore we try to get the + * memory a bit harder for them to prevent vm deadlocks. + */ + if ((sc->sc_flags & G_ELI_FLAG_ONETIME) != 0) + uma_flags = M_NOWAIT|M_USE_RESERVE; + else + uma_flags = M_WAITOK; + + while (NULL == (p = uma_zalloc(g_eli_zone, uma_flags))) { + /* Only reachable for onetime providers */ + pause("g_eli:uma", min(hz/1000, 1)); + } + } else { + p = malloc(size, M_ELI, M_WAITOK); + } bp->bio_inbed = 0; bp->bio_children = nsec; bp->bio_driver2 = p; -- 2.7.4