From 2b469d555267cf3773db719b1f834791c4f19086 Mon Sep 17 00:00:00 2001 From: Fabian Keil Date: Wed, 13 Sep 2017 04:31:10 +0200 Subject: [PATCH] ZFS: Add vfs.zfs.risk_data_corruption sysctl ... and don't split large blocks while sending unless the sysctl is set. Splitting large blocks can lead to silent data corruption when receiving incremental streams with a block size change on the receiving side. Example corruption: [fk@test-vm ~]$ hd /usr/test/src/sbin/camcontrol/camcontrol.c 00000000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 |................| * 0003ea10 [fk@test-vm ~]$ sudo zdb -ddddddd -bbbbbbb rpool/test/src $(stat -f '%i' /usr/test/src/sbin/camcontrol/camcontrol.c) Dataset rpool/test/src [ZPL], ID 550, cr_txg 67575, 704M, 78457 objects, rootbp DVA[0]=<0:254bfb000:1000> DVA[1]=<0:3803b7000:1000> [L0 DMU objset] sha256 uncompressed LE contiguous unique double size=800L/800P birth=67755L/67755P fill=78457 cksum=fb0fd08b71ffb3f8:e14c2e477498cdc4:4822440c7db2cccc:9fa4961bce0031b6 Object lvl iblk dblk dsize lsize %full type 100844 2 128K 251K 0 251K 0.00 ZFS plain file (K=inherit) (Z=inherit) 168 bonus System attributes dnode flags: USED_BYTES USERUSED_ACCOUNTED dnode maxblkid: 0 path /sbin/camcontrol/camcontrol.c uid 1001 gid 1001 atime Sat Jul 8 18:41:59 2017 mtime Sat Jul 8 18:41:59 2017 ctime Sat Jul 8 18:41:59 2017 crtime Sat Jul 8 18:41:59 2017 gen 609632 mode 100644 size 256529 parent 44643 links 1 pflags 40800000004 Indirect blocks: 0 L1 HOLE [L1 ZFS plain file] size=20000L birth=67755L segment [0000000000000000, 000000000003ec00) size 251K Note that the silent data corruption doesn't necessarily occur right away. For example splitting large blocks is fine when sending to a dataset with recordsize<=128k, but if the recordsize is later on increased to 1m the receiving of a incremental snapshot with large blocks (that haven't been split) can cause corruption on the receiving side as well. Never splitting large block at all avoids the (known and reproducible) problems. In practice it's a minor inconvenience and certainly less inconvenient than losing data. The zfs error message could be improved, though: # zfs send -i testpool/source@snap1 testpool/source@snap2 >/dev/null warning: cannot send 'testpool/source@snap2': unsupported version or feature # zfs send -L -i testpool/source@snap1 testpool/source@snap2 >/dev/null zfs-dbgmsg will provide the details of course: 2017 Sep 13 14:51:48: do_dump(): Aborting send as it would require splitting \ of large blocks which can cause silent data corruption. Set vfs.zfs.risk_data_corruption=1 \ to allow this or send with -L flag. See also: https://lists.freebsd.org/pipermail/freebsd-fs/2017-August/025110.html https://github.com/zfsonlinux/zfs/issues/6224 https://www.fabiankeil.de/tmp/reproduce-zfsonlinux-bug-6224.sh Obtained from: ElectroBSD --- .../opensolaris/uts/common/fs/zfs/dmu_send.c | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c index 47bb3f6428b5..c6e10c693cb1 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c @@ -73,6 +73,9 @@ int zfs_recv_queue_length = 16 * 1024 * 1024; /* Set this tunable to FALSE to disable setting of DRR_FLAG_FREERECORDS */ int zfs_send_set_freerecords_bit = B_TRUE; +/* Set this tunable to 1 to enable code paths that are known to cause data corruption */ +int zfs_risk_data_corruption = 0; + #ifdef _KERNEL SYSCTL_DECL(_vfs_zfs); SYSCTL_INT(_vfs_zfs, OID_AUTO, send_corrupt_data, CTLFLAG_RWTUN, @@ -80,6 +83,9 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, send_corrupt_data, CTLFLAG_RWTUN, "Blocks filled with 0x'zfs badd bloc' are send as replacement blocks."); SYSCTL_INT(_vfs_zfs, OID_AUTO, send_set_freerecords_bit, CTLFLAG_RWTUN, &zfs_send_set_freerecords_bit, 0, "Set the freerecords bits when sending"); +SYSCTL_INT(_vfs_zfs, OID_AUTO, risk_data_corruption, CTLFLAG_RWTUN, + &zfs_risk_data_corruption, 0, + "Enable code paths that are known to lead to data corruption under the right circumstances"); #endif static char *dmu_recv_tag = "dmu_recv_tag"; @@ -707,6 +713,25 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) */ boolean_t split_large_blocks = blksz > SPA_OLD_MAXBLOCKSIZE && !(dsa->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS); + + /* + * Splitting large blocks can cause silent data corruption + * so we only "support" this if the user wants to risk it. + */ + if (split_large_blocks && !zfs_risk_data_corruption) { + static const char msg[] = "Aborting send as it would " + "require splitting of large blocks which can " + "cause silent data corruption. Set " + "vfs.zfs.risk_data_corruption=1 to allow this or" + "send with -L flag (if the receiver supports it)."; +#ifdef _KERNEL + zfs_dbgmsg("%s: %s", __func__, msg); +#else + fprintf(stderr, "%s: %s\n", __func__, msg); +#endif + return (SET_ERROR(ENOTSUP)); + } + /* * We should only request compressed data from the ARC if all * the following are true: -- 2.32.0