ZFS on Linux

From Lolly's Wiki
Jump to navigation | Jump to search


Grub

Create /etc/udev/rules.d/99-local-grub.rules with this content:

# Create by-id links in /dev as well for zfs vdev. Needed by grub
# Add links for zfs_member only
KERNEL=="sd*[0-9]", IMPORT{parent}=="ID_*", ENV{ID_FS_TYPE}=="zfs_member", SYMLINK+="$env{ID_BUS}-$env{ID_SERIAL}-part%n"


Virtualbox on ZVols

If you use ZVols as raw VMDK devices in VirtualBox as a normal user (vmuser in this example), create /etc/udev/rules.d/99-local-zvol.rules with this content:

KERNEL=="zd*" SUBSYSTEM=="block" ACTION=="add|change" PROGRAM="/lib/udev/zvol_id /dev/%k" RESULT=="rpool/VM/*" OWNER="vmuser"
vmuser@virtualbox-server:~$ VBoxManage internalcommands createrawvmdk -filename /var/data/VMs/dev/Solaris10.vmdk -rawdisk /dev/zvol/rpool/VM/Solaris10

Setup Ubuntu 16.04 with ZFS root

Most of this is taken from the Ubuntu-16.04-Root-on-ZFS guide.

Boot Ubuntu Desktop (alias Live CD) and choose "try out".

Get the right ashift value

For example to get sda and sdb:

# lsblk -o NAME,PHY-SeC,LOG-SEC /dev/sd{a,b} | awk 'function exponent (value) {for(i=0;value>1;i++){value/=2;}; return i;}{if($2 ~ /[0-9]+/){print $0,exponent($2)}else{print$0,"ashift"}}'
NAME   PHY-SEC LOG-SEC ashift
sda        512     512 9
├─sda1     512     512 9
├─sda2     512     512 9
├─sda3     512     512 9
└─sda4     512     512 9
sdb       4096     512 12
├─sdb1    4096     512 12
├─sdb2    4096     512 12
├─sdb3    4096     512 12
└─sdb4    4096     512 12

Connect it to your network

sudo -i
ifconfig ens160 <IP> netmask 255.255.255.0
route add default gw <defaultrouter>

echo "nameserver <nameserver>" >> /etc/resolv.conf
echo 'Acquire::http::Proxy "http://<user>:<pass>@<proxyhost>:<proxyport>";' >> /etc/apt/apt.conf

apt-add-repository universe
apt update
apt --yes install openssh-server
passwd ubuntu

Reconnect via ssh

apt install --yes debootstrap gdisk zfs-initramfs
sgdisk -g -a1 -n2:34:2047  -t2:EF02 /dev/disk/by-id/scsi-36000c2932cdb62febff0b5ac93786dd4
sgdisk        -n9:-8M:0    -t9:BF07 /dev/disk/by-id/scsi-36000c2932cdb62febff0b5ac93786dd4
sgdisk        -n1:0:0      -t1:BF01 /dev/disk/by-id/scsi-36000c2932cdb62febff0b5ac93786dd4

zpool create -f -o ashift=12 \
      -O atime=off \
      -O canmount=off \
      -O compression=lz4 \
      -O normalization=formD \
      -O mountpoint=/ \
      -R /mnt \
      rpool /dev/disk/by-id/scsi-36000c2932cdb62febff0b5ac93786dd4-part1

zfs create -o canmount=off -o mountpoint=none rpool/ROOT
zfs create -o canmount=noauto -o mountpoint=/ rpool/ROOT/ubuntu
zfs mount rpool/ROOT/ubuntu
zfs create                 -o setuid=off              rpool/home
zfs create -o mountpoint=/root                        rpool/home/root
zfs create -o canmount=off -o setuid=off  -o exec=off rpool/var
zfs create -o com.sun:auto-snapshot=false             rpool/var/cache
zfs create                                            rpool/var/log
zfs create                                            rpool/var/spool
zfs create -o com.sun:auto-snapshot=false -o exec=on  rpool/var/tmp
zfs create -V 4G -b $(getconf PAGESIZE) -o compression=zle \
      -o logbias=throughput -o sync=always \
      -o primarycache=metadata -o secondarycache=none \
      -o com.sun:auto-snapshot=false rpool/swap

cp -p {,/mnt}/etc/apt/apt.conf
# /mnt/etc/apt/apt.conf does not exist before debootstrap has populated /mnt,
# so take the proxy from the live system's apt.conf that was written above.
export http_proxy=$(awk '/Acquire::http::Proxy/{gsub(/\"/,"");gsub(/;$/,"");print $2}' /etc/apt/apt.conf)
echo -n xenial{,-security,-updates} | \
  xargs -n 1 -d ' ' -I{} echo "deb http://archive.ubuntu.com/ubuntu {} main universe" > /mnt/etc/apt/sources.list

chmod 1777 /mnt/var/tmp
debootstrap xenial /mnt
zfs set devices=off rpool

HOSTNAME=Template-VM
echo ${HOSTNAME} > /mnt/etc/hostname
printf "127.0.1.1\t%s\n" "${HOSTNAME}" >> /mnt/etc/hosts

INTERFACE=$(ip a s scope global | awk 'NR==1{gsub(/:$/,"",$2);print $2;}')
printf "auto %s\niface %s inet dhcp\n" "${INTERFACE}" "${INTERFACE}" > /mnt/etc/network/interfaces.d/${INTERFACE}

mount --rbind /dev  /mnt/dev
mount --rbind /proc /mnt/proc
mount --rbind /sys  /mnt/sys
cp -p {,/mnt}/etc/apt/apt.conf
echo -n xenial{,-security,-updates} | \
  xargs -n 1 -d ' ' -I{} echo "deb http://archive.ubuntu.com/ubuntu {} main universe" > /mnt/etc/apt/sources.list

chroot /mnt /bin/bash --login

locale-gen en_US.UTF-8
echo 'LANG="en_US.UTF-8"' > /etc/default/locale
LANG="en_US.UTF-8"
dpkg-reconfigure tzdata

ln -s /proc/self/mounts /etc/mtab
apt update
apt install --yes ubuntu-minimal
apt install --yes --no-install-recommends linux-image-generic
apt install --yes zfs-initramfs
apt install --yes openssh-server

apt install --yes grub-pc
addgroup --system lpadmin
addgroup --system sambashare
passwd

grub-probe /

update-initramfs -c -k all

vi /etc/default/grub
Comment out: GRUB_HIDDEN_TIMEOUT=0
Remove quiet and splash from: GRUB_CMDLINE_LINUX_DEFAULT
Uncomment: GRUB_TERMINAL=console

update-grub
grub-install /dev/disk/by-id/scsi-36000c2932cdb62febff0b5ac93786dd4

zfs snapshot rpool/ROOT/ubuntu@install

exit
# Lazily unmount every non-ZFS mount below /mnt, last mounted first.
# -I{} replaces the deprecated "-i{}" spelling of xargs.
mount | grep -v zfs | tac | awk '/\/mnt/ {print $3}' | xargs -I{} umount -lf {}
zpool export rpool

reboot

apt install --yes cryptsetup
echo cryptswap1 /dev/zvol/rpool/swap /dev/urandom swap,cipher=aes-xts-plain64:sha256,size=256 >> /etc/crypttab
systemctl daemon-reload
systemctl start systemd-cryptsetup@cryptswap1.service
echo /dev/mapper/cryptswap1 none swap defaults 0 0 >> /etc/fstab
swapon -av

Swap on ZFS with random key encryption

$ sudo systemctl edit --force --full zfs-cryptswap@.service
# /etc/systemd/system/zfs-cryptswap@.service
[Unit]
Description=ZFS Random Cryptography Setup for %I
Documentation=man:zfs(8)
DefaultDependencies=no
Conflicts=umount.target
IgnoreOnIsolate=true
After=systemd-random-seed.service zfs-volumes.target
BindsTo=dev-zvol-rpool-%i.device
Before=umount.target

[Service]
Type=oneshot
RemainAfterExit=yes
TimeoutSec=0
KeyringMode=shared
OOMScoreAdjust=500
UMask=0077
RuntimeDirectory=zfs-cryptswap.%i
RuntimeDirectoryMode=0700
ExecStartPre=-/sbin/swapoff '/dev/zvol/rpool/%i'
ExecStartPre=-/sbin/zfs destroy 'rpool/%i'
ExecStartPre=/bin/dd if=/dev/urandom of=/run/zfs-cryptswap.%i/%i.key bs=32 count=1
ExecStart=/sbin/zfs create -V 4G -b 8k -o compression=zle -o logbias=throughput -o sync=always -o primarycache=metadata -o secondarycache=none -o com.sun:auto-snapshot=false -o encryption=on -o keyformat=raw -o keylocation=file:///run/zfs-cryptswap.%i/%i.key rpool/%i
ExecStart=/bin/sleep 1
ExecStartPost=/sbin/mkswap '/dev/zvol/rpool/%i'
ExecStartPost=/sbin/swapon '/dev/zvol/rpool/%i'
ExecStop=/sbin/swapoff '/dev/zvol/rpool/%i'
ExecStop=/bin/sleep 2
ExecStopPost=/sbin/zfs destroy 'rpool/%i'

[Install]
WantedBy=swap.target

!!!BE CAREFUL with the name after @ !!!

The name after the @ is the name of the ZFS volume below rpool that will be DESTROYED and recreated!!!

To destroy and recreate an encrypted ZFS volume named cryptswap use:

# systemctl start  zfs-cryptswap@cryptswap.service
# systemctl enable zfs-cryptswap@cryptswap.service
# update-initramfs -k "$(uname -r)" -u

Kernel settings for ZFS

Set module parameter in /etc/modprobe.d/zfs.conf

# Upper limit of the ARC cache in bytes: 10737418240 = 10 * 1024 * 1024 * 1024
options zfs zfs_arc_max=10737418240

# Increase these so that scrub/resilver runs faster, at the cost of other work
options zfs zfs_vdev_scrub_min_active=24
options zfs zfs_vdev_scrub_max_active=64
# Synchronous writes
options zfs zfs_vdev_sync_write_min_active=8
options zfs zfs_vdev_sync_write_max_active=32
# Synchronous reads (normal reads)
options zfs zfs_vdev_sync_read_min_active=8
options zfs zfs_vdev_sync_read_max_active=32
# Asynchronous reads: prefetcher
options zfs zfs_vdev_async_read_min_active=8
options zfs zfs_vdev_async_read_max_active=32
# Asynchronous writes: bulk writes
options zfs zfs_vdev_async_write_min_active=8
options zfs zfs_vdev_async_write_max_active=32

# Maximum write speed to the L2ARC device
# Trade-off between write/read throughput and the durability of the SSD (?)
# default      : 8 * 1024 * 1024
# setting here : 500 * 1024 * 1024 = 524288000
options zfs l2arc_write_max=524288000

# NOTE(review): presumably scan/resilver tuning (in-flight I/O limit, minimum
# resilver time slice, resilver delay) - confirm that the installed ZFS
# version still provides these three parameters.
options zfs zfs_top_maxinflight=512
options zfs zfs_resilver_min_time_ms=8000
options zfs zfs_resilver_delay=0

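Remember to update your initramfs before you reboot. On a system that boots from ZFS the zfs module is loaded from the initramfs, so the module options are read from the copy of /etc/modprobe.d/zfs.conf that is stored there.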

# update-initramfs -k all -u

Check settings

root@zfshost:~# modprobe -c | grep "options zfs"
options zfs zfs_arc_max=10737418240
options zfs zfs_vdev_scrub_min_active=24
options zfs zfs_vdev_scrub_max_active=64
options zfs zfs_vdev_sync_write_min_active=8
options zfs zfs_vdev_sync_write_max_active=32
options zfs zfs_vdev_sync_read_min_active=8
options zfs zfs_vdev_sync_read_max_active=32
options zfs zfs_vdev_async_read_min_active=8
options zfs zfs_vdev_async_read_max_active=32
options zfs zfs_vdev_async_write_min_active=8
options zfs zfs_vdev_async_write_max_active=32
options zfs l2arc_write_max=524288000
options zfs zfs_top_maxinflight=512
options zfs zfs_resilver_min_time_ms=8000
options zfs zfs_resilver_delay=0
root@zfshost:~# modprobe --show-depends zfs
insmod /lib/modules/4.15.0-58-generic/kernel/spl/spl.ko 
insmod /lib/modules/4.15.0-58-generic/kernel/zfs/znvpair.ko 
insmod /lib/modules/4.15.0-58-generic/kernel/zfs/zcommon.ko 
insmod /lib/modules/4.15.0-58-generic/kernel/zfs/icp.ko 
insmod /lib/modules/4.15.0-58-generic/kernel/zfs/zavl.ko 
insmod /lib/modules/4.15.0-58-generic/kernel/zfs/zunicode.ko 
insmod /lib/modules/4.15.0-58-generic/kernel/zfs/zfs.ko zfs_arc_max=10737418240 zfs_vdev_scrub_min_active=24 zfs_vdev_scrub_max_active=64 zfs_vdev_sync_write_min_active=8 zfs_vdev_sync_write_max_active=32 zfs_vdev_sync_read_min_active=8 zfs_vdev_sync_read_max_active=32 zfs_vdev_async_read_min_active=8 zfs_vdev_async_read_max_active=32 zfs_vdev_async_write_min_active=8 zfs_vdev_async_write_max_active=32 l2arc_write_max=524288000 zfs_top_maxinflight=512 zfs_resilver_min_time_ms=8000 zfs_resilver_delay=0

Check actual settings

Check files in

  • /proc/spl/kstat/zfs/
  • /sys/module/zfs/parameters/

ARC Cache

Get the current usage of cache

# cat /proc/spl/kstat/zfs/arcstats |grep c_
c_min                           4    521779200
c_max                           4    1073741824
arc_no_grow                     4    0
arc_tempreserve                 4    0
arc_loaned_bytes                4    0
arc_prune                       4    25360
arc_meta_used                   4    493285336
arc_meta_limit                  4    805306368
arc_dnode_limit                 4    80530636
arc_meta_max                    4    706551816
arc_meta_min                    4    16777216
sync_wait_for_async             4    357
arc_need_free                   4    0
arc_sys_free                    4    260889600

Limit the cache without a reboot (non-permanent)

For example limit it to 512MB (which is too small for production environments, just an example...):

# echo "$((512*1024*1024))" > /sys/module/zfs/parameters/zfs_arc_max

Now you have to drop the caches:

# echo 3 > /proc/sys/vm/drop_caches

Make the cache limit permanent

For example limit it to 512MB (which is too small for production environments, just an example...):

# echo "options zfs zfs_arc_max=$((512*1024*1024))" >> /etc/modprobe.d/zfs.conf

This value takes effect after the next reboot.

Check cache hits/misses

# (while : ; do cat /proc/spl/kstat/zfs/arcstats ; sleep 5 ; done ) | awk '
          BEGIN { 
          }     
          $1 ~ /(hits|misses)/ {
                  name=$1;
                  gsub(/[_]*(hits|misses)/,"",name);
                  if(name == ""){ 
                    name="global";
                  }
          }
          $1 ~ /hits/ {
                  hits[name] = $3 - hitslast[name]
                  hitslast[name] = $3
          }     
          $1 ~ /misses/ {
                  misses[name] = $3 - misslast[name]
                  misslast[name] = $3
                  rate = 0
                  total = hits[name] + misses[name]
                  if (total)
                          rate = (hits[name] * 100) / total
                  if (name=="global")
                    printf "%30s %12s %12s %9s\n", "NAME", "HITS", "MISSES", "HITRATE"

                  printf "%30s %12d %12d %8.2f%%\n", name, hits[name], misses[name], rate
          }     
  '

Higher scrub performance

#!/bin/bash

#
## scrub_fast.sh
#
# Switch the ZFS scan/scrub module parameters between a fast profile
# ("start") and a slow profile ("stop"), or print the current values
# ("status").
#
# Usage: scrub_fast.sh (start|stop|status)
#
# Environment:
#   ZFS_PARAM_DIR  directory that holds the module parameter files
#                  (default: /sys/module/zfs/parameters)

# Parameters handled by this script.  The two value tables below use the
# same order, one value per parameter.
scrub_params=(
  zfs_scan_idle
  zfs_scrub_delay
  zfs_top_maxinflight
  zfs_scan_min_time_ms
  zfs_vdev_scrub_min_active
  zfs_vdev_scrub_max_active
)
scrub_fast_values=(0 0 512 5000 4 8)
# NOTE(review): presumably the module defaults of the ZFS version this was
# written for - confirm before using "stop" on a newer release.
scrub_slow_values=(50 4 32 1000 1 2)

# Print the directory that holds the module parameter files.
scrub_param_dir() {
  printf '%s' "${ZFS_PARAM_DIR:-/sys/module/zfs/parameters}"
}

# Write one value per parameter, in the order of scrub_params.
# Arguments: the values to write.
# A parameter file that is missing or not writable is skipped with a
# warning on stderr, so the remaining parameters are still applied.
scrub_apply() {
  local dir idx=0 value path
  dir=$(scrub_param_dir)
  for value in "$@"
  do
    path="${dir}/${scrub_params[idx]}"
    if [[ -w "${path}" ]]
    then
      echo "${value}" > "${path}"
    else
      echo "${0}: skipping ${path}: not present or not writable" >&2
    fi
    idx=$((idx + 1))
  done
}

# Print every parameter path together with its current value.
# A parameter that cannot be read is shown as "n/a" instead of a
# misleading 0.
scrub_status() {
  local dir name path
  dir=$(scrub_param_dir)
  for name in "${scrub_params[@]}"
  do
    path="${dir}/${name}"
    if [[ -r "${path}" ]]
    then
      printf "%60s\t%d\n" "${path}" "$(cat "${path}")"
    else
      printf "%60s\t%s\n" "${path}" "n/a"
    fi
  done
}

case "${1}" in
start)
  scrub_apply "${scrub_fast_values[@]}"
  ;;
stop)
  scrub_apply "${scrub_slow_values[@]}"
  ;;
status)
  scrub_status
  ;;
*)
  echo "Usage: ${0} (start|stop|status)"
  ;;
esac

More information on zpool status

#!/bin/bash

#
## print_zpool.sh
#
# Wrapper around "zpool status" (default) or "zpool iostat -v" that appends
# two extra fields to every output line whose first field is a /dev/ path:
# the first column of "lsscsi --scsi_id" for the underlying disk and the
# kernel device node the /dev/disk/by-id link points to.
#
# Usage: print_zpool.sh [iostat] [zpool arguments...]

# Written by Lars Timmann <L@rs.Timmann.de> 2022

columns=5 # number of columns for zpool status
zpool_args=(status)
# Quote ${1}: an empty or blank-containing first argument must not break the test.
if [ ${#} -gt 0 ] && [ "${1}" == "iostat" ]
then
  zpool_args=(iostat -v)
  columns=7
  shift
fi

# stdbuf keeps zpool line-buffered so that awk sees every line immediately;
# "${@}" hands the remaining arguments on without re-splitting them.
stdbuf --output=L zpool "${zpool_args[@]}" -P "${@}" | awk -v columns="${columns}" '
BEGIN {
  # Map each device node (second to last lsscsi field) to the first lsscsi field.
  command="lsscsi --scsi_id";
  while( command | getline lsscsi ) {
    count=split(lsscsi,fields);
    dev=fields[count-1];
    scsi_id[dev]=fields[1];
  }
  close(command);
  
  # Map each /dev/disk/by-id link to the device node it points to:
  # the link target (last field) loses its dots and slashes, e.g. ../../sda -> sda.
  command="ls -Ul /dev/disk/by-id/*";
  while( command | getline ) {
    dev=$NF;
    gsub(/[\.\/]/,"",dev);
    dev_id=$(NF-2);
    device[dev_id]="/dev/"dev;
  }
  close(command);
}
# Lines that name a device path.
$1 ~ /\/dev\// {
  line=$0;
  dev_by_id=$1;
  # Strip a trailing partition suffix to get the whole-disk link name.
  dev_no_part=dev_by_id;
  gsub(/(-part|)[0-9]+$/,"",dev_no_part);
  if( NF > 5) {
    # Keep the first <columns> fields with their original spacing in line and
    # move everything behind them into rest, so the extra fields land between.
    # NOTE(review): four-argument split() is a gawk extension - confirm that
    # awk is gawk on the target system.
    count=split(line,a,FS,seps);
    line=seps[0];
    for(i=1;i<columns;i++){
      line=line a[i] seps[i];
    }
    line=line a[columns];
    for(i=columns+1;i<=count;i++){
      rest=rest a[i] seps[i];
    }
  }
  printf("%s %s %s",line,scsi_id[device[dev_no_part]],device[dev_by_id]);
  if(rest!=""){
    printf(" %s",rest);
    rest="";
  }
  printf("\n");
  next;
}
# Flush after the errors line so the output shows up promptly.
/^errors:/ {
  print;
  fflush();
  next;
}
{ 
  print;
}'

Backup ZFS settings

A little script which may be used at your own risk.

#!/bin/bash

# Written by Lars Timmann <L@rs.Timmann.de> 2018
# Tested on solaris 11.3 & Ubuntu Linux 

# This script is a rotten bunch of code... rewrite it!

# Paths of the external tools.  Each one may be overridden from the
# environment (for example ZPOOL_CMD=/usr/sbin/zpool); the defaults are the
# paths the script has always used.
AWK_CMD=${AWK_CMD:-/usr/bin/gawk}
ZPOOL_CMD=${ZPOOL_CMD:-/sbin/zpool}
ZFS_CMD=${ZFS_CMD:-/sbin/zfs}
ZDB_CMD=${ZDB_CMD:-/sbin/zdb}

#######################################
# Print one "-o property=value \" continuation line for every locally set
# property of a dataset, for use inside a generated create command.
# Globals:   ZFS_CMD (read)
# Arguments: $1 - dataset name
#            $2 - property selector handed to "zfs get" (for example all)
#            $3 - extended regex; properties whose name matches are skipped
# Outputs:   lines of the form "<TAB>-o property=value \" on stdout
#######################################
function print_local_options () {
  local dataset=$1
  local option=$2
  local exclude_regex=$3
  local property value

  # "zfs get -H" separates the columns with tabs; splitting on tabs only
  # keeps values that contain blanks in one piece.
  ${ZFS_CMD} get -s local -Ho property,value -p "${option}" "${dataset}" \
    | while IFS=$'\t' read -r property value
  do
    if [[ ${property} =~ ${exclude_regex} ]]
    then
      continue
    fi
    if [ "_${property}_" == "_share.*_" ]
    then
      # A literal "share.*" entry is expanded by querying share.all instead.
      print_local_options "${dataset}" 'share.all' '^$'
    else
      # %q shell-quotes the value, so the generated command stays valid even
      # when the value contains blanks or other shell metacharacters.
      printf '\t-o %s=%q \\\n' "${property}" "${value}"
    fi
  done
}

# Emit a complete create command for one filesystem: the command head, one
# option line per locally set property, and the dataset name as last line.
# Arguments: $1 - dataset name (also stored in the global ZFS)
function print_filesystem () {
  ZFS=$1

  # Command head, ending in a line continuation.
  printf '%s\n' "${ZFS_CMD} create \\"
  # All locally set properties; the regex ^$ excludes nothing.
  print_local_options "${ZFS}" all '^$'
  # The dataset name closes the command.
  printf "\t%s\n" "${ZFS}"
}

#######################################
# Print a commented create command for every filesystem below a pool.
# The pool's own root dataset is skipped.
# Globals:   ZFS_CMD (read)
# Arguments: $1 - pool name
# Outputs:   one header comment plus one create command per filesystem
#######################################
function print_filesystems () {
  local zpool=$1
  local dataset

  # Read the names line by line on file descriptor 3: dataset names may
  # contain blanks, and stdin stays untouched for the commands in the loop.
  while IFS= read -r -u 3 dataset
  do
    if [ "${dataset}" == "${zpool}" ] ; then continue ; fi
    printf '#\n## Filesystem: %s\n#\n\n' "${dataset}"
    print_filesystem "${dataset}"
    printf '\n'
  done 3< <(${ZFS_CMD} list -Ho name -t filesystem -r "${zpool}")
}

function print_volume () {
  ZVOL=$1
  volsize=$(${ZFS_CMD} get -Ho value volsize ${ZVOL})
  volblocksize=$(${ZFS_CMD} get -Ho value volblocksize ${ZVOL})
  
  printf '%s create \\\n\t-V %s \\\n\t-b %s \\\n' "${ZFS_CMD}" "${volsize}" "${volblocksize}"
  print_local_options "${ZVOL}" 'all' '(volsize|refreservation)'
  printf '\t%s\n' "${ZVOL}"
}

#######################################
# Print a commented create command for every volume below a pool.
# Globals:   ZFS_CMD (read)
# Arguments: $1 - pool name
# Outputs:   one header comment plus one create command per volume
#######################################
function print_volumes () {
  local zpool=$1
  local zvol

  # Read the names line by line on file descriptor 3: volume names may
  # contain blanks, and stdin stays untouched for the commands in the loop.
  while IFS= read -r -u 3 zvol
  do
    printf '#\n## Volume: %s\n#\n\n' "${zvol}"
    print_volume "${zvol}"
    printf '\n'
  done 3< <(${ZFS_CMD} list -Ho name -t volume -r "${zpool}")
}

#######################################
# Print the vdev part of a "zpool create" command from the cached pool
# configuration: one continuation line per "mirror" keyword and per device
# path, in the order they appear in the "zdb -C" output.
# Globals:   ZDB_CMD, AWK_CMD (read)
# Arguments: $1 - pool name
# Outputs:   " \<NL><TAB>item" per item, then a final newline
# NOTE(review): only the type "mirror" is emitted as a keyword; raidz, log,
# cache and spare layouts appear to be flattened into plain devices -
# check the output by hand for such pools.
#######################################
function print_vdevs () {
  local zpool=$1

  ${ZDB_CMD} -C "${zpool}" | ${AWK_CMD} -F':' '
    # Return the text behind the first colon of a "key: value" line, without
    # surrounding blanks and single quotes.  Working on the whole line keeps
    # values that contain colons (by-path device names) intact.
    function config_value(line,    text) {
      text = line;
      sub(/^[^:]*:[ \t]*/, "", text);
      sub(/[ \t]+$/, "", text);
      gsub(/^\047|\047$/, "", text);
      return text;
    }
    $1 ~ /^[[:space:]]*type$/ {
      type=config_value($0);
      if ( type == "mirror" ) {
        printf " \\\n\t%s",type;
      }
    }
    $1 ~ /^[[:space:]]*path$/ {
      vdev=config_value($0);
      printf " \\\n\t%s",vdev;
    }
    END {
      printf "\n";
    }
  '
}

#######################################
# Print everything needed to recreate one pool: a banner, the
# "zpool create" command with the locally set properties of the root
# dataset and the vdev list, followed by the create commands for all
# filesystems and volumes of the pool.
# Globals:   ZPOOL_CMD (read)
# Arguments: $1 - pool name
# Outputs:   the generated commands on stdout
#######################################
function print_zpool () {
  local zpool=$1

  printf '#############################################################\n'
  printf '#\n## ZPool: %s\n#\n' "${zpool}"
  printf '#############################################################\n\n'

  printf '%s create \\\n' "${ZPOOL_CMD}"
  # The properties come from "zfs get" on the root dataset, i.e. they are
  # file system properties.  "zpool create" takes those with -O; -o is
  # reserved for pool properties, so the option letter is rewritten here.
  # NOTE(review): the exclude regex /@/ is kept as it was; it looks like it
  # never matches a property name - confirm the intent.
  print_local_options "${zpool}" 'all' '/@/' | sed $'s/^\t-o /\t-O /'
  printf '\t%s' "${zpool}"
  print_vdevs "${zpool}"
  printf '\n'

  printf '#############################################################\n\n'
  print_filesystems   "${zpool}"
  print_volumes       "${zpool}"
}

# Operating system name as reported by uname.
OS=$(uname -s)
# Flag variable named after the operating system (its value is 1); assigned
# with printf -v so that no eval is needed.
printf -v "${OS}" '%s' 1
HOSTNAME=$(hostname)

printf '#############################################################\n'
printf '# Hostname: %s\n' "${HOSTNAME}"
printf '#############################################################\n\n'
# Read the pool names line by line on file descriptor 3, so that names are
# not word-split and stdin stays untouched for the commands in the loop.
while IFS= read -r -u 3 ZPOOL
do
  print_zpool "${ZPOOL}"
done 3< <(${ZPOOL_CMD} list -Ho name)

Links