ZFS on Linux

From Lolly's Wiki
Jump to navigationJump to search
The printable version is no longer supported and may have rendering errors. Please update your browser bookmarks and please use the default browser print function instead.


Grub

Create /etc/udev/rules.d/99-local-grub.rules with this content:

# Create by-id links in /dev as well for zfs vdev. Needed by grub
# Add links for zfs_member only
KERNEL=="sd*[0-9]", IMPORT{parent}=="ID_*", ENV{ID_FS_TYPE}=="zfs_member", SYMLINK+="$env{ID_BUS}-$env{ID_SERIAL}-part%n"


Virtualbox on ZVols

If you use ZVols as raw VMDK devices in VirtualBox as a normal user (vmuser in this example), create /etc/udev/rules.d/99-local-zvol.rules with this content:

KERNEL=="zd*" SUBSYSTEM=="block" ACTION=="add|change" PROGRAM="/lib/udev/zvol_id /dev/%k" RESULT=="rpool/VM/*" OWNER="vmuser"
vmuser@virtualbox-server:~$ VBoxManage internalcommands createrawvmdk -filename /var/data/VMs/dev/Solaris10.vmdk -rawdisk /dev/zvol/rpool/VM/Solaris10

Setup Ubuntu 16.04 with ZFS root

Most of this is taken from the Ubuntu-16.04-Root-on-ZFS guide.

Boot Ubuntu Desktop (alias Live CD) and choose "try out".

Get the right ashift value

For example to get sda and sdb:

# lsblk -o NAME,PHY-SeC,LOG-SEC /dev/sd{a,b} | awk 'function exponent (value) {for(i=0;value>1;i++){value/=2;}; return i;}{if($2 ~ /[0-9]+/){print $0,exponent($2)}else{print$0,"ashift"}}'
NAME   PHY-SEC LOG-SEC ashift
sda        512     512 9
├─sda1     512     512 9
├─sda2     512     512 9
├─sda3     512     512 9
└─sda4     512     512 9
sdb       4096     512 12
├─sdb1    4096     512 12
├─sdb2    4096     512 12
├─sdb3    4096     512 12
└─sdb4    4096     512 12

Connect it to your network

sudo -i
ifconfig ens160 <IP> netmask 255.255.255.0
route add default gw <defaultrouter>

echo "nameserver <nameserver>" >> /etc/resolv.conf
echo 'Acquire::http::Proxy "http://<user>:<pass>@<proxyhost>:<proxyport>";' >> /etc/apt/apt.conf

apt-add-repository universe
apt update
apt --yes install openssh-server
passwd ubuntu

Reconnect via ssh

apt install --yes debootstrap gdisk zfs-initramfs
sgdisk -g -a1 -n2:34:2047  -t2:EF02 /dev/disk/by-id/scsi-36000c2932cdb62febff0b5ac93786dd4
sgdisk        -n9:-8M:0    -t9:BF07 /dev/disk/by-id/scsi-36000c2932cdb62febff0b5ac93786dd4
sgdisk        -n1:0:0      -t1:BF01 /dev/disk/by-id/scsi-36000c2932cdb62febff0b5ac93786dd4

zpool create -f -o ashift=12 \
      -O atime=off \
      -O canmount=off \
      -O compression=lz4 \
      -O normalization=formD \
      -O mountpoint=/ \
      -R /mnt \
      rpool /dev/disk/by-id/scsi-36000c2932cdb62febff0b5ac93786dd4-part1

zfs create -o canmount=off -o mountpoint=none rpool/ROOT
zfs create -o canmount=noauto -o mountpoint=/ rpool/ROOT/ubuntu
zfs mount rpool/ROOT/ubuntu
zfs create                 -o setuid=off              rpool/home
zfs create -o mountpoint=/root                        rpool/home/root
zfs create -o canmount=off -o setuid=off  -o exec=off rpool/var
zfs create -o com.sun:auto-snapshot=false             rpool/var/cache
zfs create                                            rpool/var/log
zfs create                                            rpool/var/spool
zfs create -o com.sun:auto-snapshot=false -o exec=on  rpool/var/tmp
zfs create -V 4G -b $(getconf PAGESIZE) -o compression=zle \
      -o logbias=throughput -o sync=always \
      -o primarycache=metadata -o secondarycache=none \
      -o com.sun:auto-snapshot=false rpool/swap

cp -p {,/mnt}/etc/apt/apt.conf
export http_proxy=$(awk '/Acquire::http::Proxy/{gsub(/\"/,"");gsub(/;$/,"");print $2}' /mnt/etc/apt/apt.conf)
echo -n xenial{,-security,-updates} | \
  xargs -n 1 -d ' ' -I{} echo "deb http://archive.ubuntu.com/ubuntu {} main universe" > /mnt/etc/apt/sources.list

chmod 1777 /mnt/var/tmp
debootstrap xenial /mnt
zfs set devices=off rpool

HOSTNAME=Template-VM
echo ${HOSTNAME} > /mnt/etc/hostname
printf "127.0.1.1\t%s\n" "${HOSTNAME}" >> /mnt/etc/hosts

INTERFACE=$(ip a s scope global | awk 'NR==1{gsub(/:$/,"",$2);print $2;}')
printf "auto %s\niface %s inet dhcp\n" "${INTERFACE}" "${INTERFACE}" > /mnt/etc/network/interfaces.d/${INTERFACE}

mount --rbind /dev  /mnt/dev
mount --rbind /proc /mnt/proc
mount --rbind /sys  /mnt/sys
cp -p {,/mnt}/etc/apt/apt.conf
echo -n xenial{,-security,-updates} | \
  xargs -n 1 -d ' ' -I{} echo "deb http://archive.ubuntu.com/ubuntu {} main universe" > /mnt/etc/apt/sources.list

chroot /mnt /bin/bash --login

locale-gen en_US.UTF-8
echo 'LANG="en_US.UTF-8"' > /etc/default/locale
LANG="en_US.UTF-8"
dpkg-reconfigure tzdata

ln -s /proc/self/mounts /etc/mtab
apt update
apt install --yes ubuntu-minimal
apt install --yes --no-install-recommends linux-image-generic
apt install --yes zfs-initramfs
apt install --yes openssh-server

apt install --yes grub-pc
addgroup --system lpadmin
addgroup --system sambashare
passwd

grub-probe /

update-initramfs -c -k all

vi /etc/default/grub
Comment out: GRUB_HIDDEN_TIMEOUT=0
Remove quiet and splash from: GRUB_CMDLINE_LINUX_DEFAULT
Uncomment: GRUB_TERMINAL=console

update-grub
grub-install /dev/disk/by-id/scsi-36000c2932cdb62febff0b5ac93786dd4

zfs snapshot rpool/ROOT/ubuntu@install

exit
mount | grep -v zfs | tac | awk '/\/mnt/ {print $3}' | xargs -i{} umount -lf {}
zpool export rpool

reboot

apt install --yes cryptsetup
echo cryptswap1 /dev/zvol/rpool/swap /dev/urandom swap,cipher=aes-xts-plain64:sha256,size=256 >> /etc/crypttab
systemctl daemon-reload
systemctl start systemd-cryptsetup@cryptswap1.service
echo /dev/mapper/cryptswap1 none swap defaults 0 0 >> /etc/fstab
swapon -av

Swap on ZFS with random key encryption

$ sudo systemctl edit --force --full zfs-cryptswap@.service
# /etc/systemd/system/zfs-cryptswap@.service
[Unit]
Description=ZFS Random Cryptography Setup for %I
Documentation=man:zfs(8)
DefaultDependencies=no
Conflicts=umount.target
IgnoreOnIsolate=true
After=systemd-random-seed.service zfs-volumes.target
BindsTo=dev-zvol-rpool-%i.device
Before=umount.target

[Service]
Type=oneshot
RemainAfterExit=yes
TimeoutSec=0
KeyringMode=shared
OOMScoreAdjust=500
UMask=0077
RuntimeDirectory=zfs-cryptswap.%i
RuntimeDirectoryMode=0700
ExecStartPre=-/sbin/swapoff '/dev/zvol/rpool/%i'
ExecStartPre=-/sbin/zfs destroy 'rpool/%i'
ExecStartPre=/bin/dd if=/dev/urandom of=/run/zfs-cryptswap.%i/%i.key bs=32 count=1
ExecStart=/sbin/zfs create -V 4G -b 8k -o compression=zle -o logbias=throughput -o sync=always -o primarycache=metadata -o secondarycache=none -o com.sun:auto-snapshot=false -o encryption=on -o keyformat=raw -o keylocation=file:///run/zfs-cryptswap.%i/%i.key rpool/%i
ExecStart=/bin/sleep 1
ExecStartPost=/sbin/mkswap '/dev/zvol/rpool/%i'
ExecStartPost=/sbin/swapon '/dev/zvol/rpool/%i'
ExecStop=/sbin/swapoff '/dev/zvol/rpool/%i'
ExecStop=/bin/sleep 2
ExecStopPost=/sbin/zfs destroy 'rpool/%i'

[Install]
WantedBy=swap.target

!!!BE CAREFUL with the name after @ !!!

The name after the @ is the name of the ZFS volume that will be DESTROYED and recreated!!!

To destroy and recreate an encrypted ZFS volume named cryptswap use:

# systemctl start  zfs-cryptswap@cryptswap.service
# systemctl enable zfs-cryptswap@cryptswap.service
# update-initramfs -k "$(uname -r)" -u

Kernel settings for ZFS

Set module parameter in /etc/modprobe.d/zfs.conf

options zfs zfs_arc_max=10737418240

# increase these so scrub/resilver finishes more quickly, at the cost of other work
options zfs zfs_vdev_scrub_min_active=24
options zfs zfs_vdev_scrub_max_active=64
# sync write
options zfs zfs_vdev_sync_write_min_active=8
options zfs zfs_vdev_sync_write_max_active=32
# sync reads (normal)
options zfs zfs_vdev_sync_read_min_active=8
options zfs zfs_vdev_sync_read_max_active=32
# async reads : prefetcher
options zfs zfs_vdev_async_read_min_active=8
options zfs zfs_vdev_async_read_max_active=32
# async write : bulk writes
options zfs zfs_vdev_async_write_min_active=8
options zfs zfs_vdev_async_write_max_active=32

# max write speed to l2arc
# tradeoff between write/read and durability of ssd (?)
# default : 8 * 1024 * 1024
# setting here : 500 * 1024 * 1024
options zfs l2arc_write_max=524288000

options zfs zfs_top_maxinflight=512
options zfs zfs_resilver_min_time_ms=8000
options zfs zfs_resilver_delay=0

Remember to update your initramfs before rebooting. The initramfs is the filesystem that is read when the module is loaded.

# update-initramfs -k all -u

Check settings

root@zfshost:~# modprobe -c | grep "options zfs"
options zfs zfs_arc_max=10737418240
options zfs zfs_vdev_scrub_min_active=24
options zfs zfs_vdev_scrub_max_active=64
options zfs zfs_vdev_sync_write_min_active=8
options zfs zfs_vdev_sync_write_max_active=32
options zfs zfs_vdev_sync_read_min_active=8
options zfs zfs_vdev_sync_read_max_active=32
options zfs zfs_vdev_async_read_min_active=8
options zfs zfs_vdev_async_read_max_active=32
options zfs zfs_vdev_async_write_min_active=8
options zfs zfs_vdev_async_write_max_active=32
options zfs l2arc_write_max=524288000
options zfs zfs_top_maxinflight=512
options zfs zfs_resilver_min_time_ms=8000
options zfs zfs_resilver_delay=0
root@zfshost:~# modprobe --show-depends zfs
insmod /lib/modules/4.15.0-58-generic/kernel/spl/spl.ko 
insmod /lib/modules/4.15.0-58-generic/kernel/zfs/znvpair.ko 
insmod /lib/modules/4.15.0-58-generic/kernel/zfs/zcommon.ko 
insmod /lib/modules/4.15.0-58-generic/kernel/zfs/icp.ko 
insmod /lib/modules/4.15.0-58-generic/kernel/zfs/zavl.ko 
insmod /lib/modules/4.15.0-58-generic/kernel/zfs/zunicode.ko 
insmod /lib/modules/4.15.0-58-generic/kernel/zfs/zfs.ko zfs_arc_max=10737418240 zfs_vdev_scrub_min_active=24 zfs_vdev_scrub_max_active=64 zfs_vdev_sync_write_min_active=8 zfs_vdev_sync_write_max_active=32 zfs_vdev_sync_read_min_active=8 zfs_vdev_sync_read_max_active=32 zfs_vdev_async_read_min_active=8 zfs_vdev_async_read_max_active=32 zfs_vdev_async_write_min_active=8 zfs_vdev_async_write_max_active=32 l2arc_write_max=524288000 zfs_top_maxinflight=512 zfs_resilver_min_time_ms=8000 zfs_resilver_delay=0

Check actual settings

Check files in

  • /proc/spl/kstat/zfs/
  • /sys/module/zfs/parameters/

ARC Cache

Get the current usage of cache

# cat /proc/spl/kstat/zfs/arcstats |grep c_
c_min                           4    521779200
c_max                           4    1073741824
arc_no_grow                     4    0
arc_tempreserve                 4    0
arc_loaned_bytes                4    0
arc_prune                       4    25360
arc_meta_used                   4    493285336
arc_meta_limit                  4    805306368
arc_dnode_limit                 4    80530636
arc_meta_max                    4    706551816
arc_meta_min                    4    16777216
sync_wait_for_async             4    357
arc_need_free                   4    0
arc_sys_free                    4    260889600

Limit the cache without reboot (non-permanent)

For example limit it to 512MB (which is too small for production environments, just an example...):

# echo "$[512*1024*1024]" > /sys/module/zfs/parameters/zfs_arc_max

Now you have to drop the caches:

# echo 3 > /proc/sys/vm/drop_caches

Make the cache limit permanent

For example limit it to 512MB (which is too small for production environments, just an example...):

# echo "options zfs zfs_arc_max=$[512*1024*1024]" >> /etc/modprobe.d/zfs.conf

This value takes effect after the next reboot.

Check cache hits/misses

# (while : ; do cat /proc/spl/kstat/zfs/arcstats ; sleep 5 ; done ) | awk '
          # Input: the arcstats file, printed again every 5 seconds by the
          # shell loop. Field 1 of a line is used as the counter name and
          # field 3 as its value.
          BEGIN { 
          }     
          # Derive the group name by deleting "hits"/"misses" together with
          # any underscores directly in front of it; a name that ends up
          # empty is reported as "global".
          $1 ~ /(hits|misses)/ {
                  name=$1;
                  gsub(/[_]*(hits|misses)/,"",name);
                  if(name == ""){ 
                    name="global";
                  }
          }
          # Hits since the previous sample (the first sample is measured
          # against an unset, i.e. zero, previous value).
          $1 ~ /hits/ {
                  hits[name] = $3 - hitslast[name]
                  hitslast[name] = $3
          }     
          # Misses since the previous sample; this rule also prints the row.
          # NOTE(review): assumes the hits line of a group has been read
          # before its misses line - confirm against the arcstats layout.
          $1 ~ /misses/ {
                  misses[name] = $3 - misslast[name]
                  misslast[name] = $3
                  rate = 0
                  total = hits[name] + misses[name]
                  # rate stays 0 when there was no activity in the interval
                  if (total)
                          rate = (hits[name] * 100) / total
                  # the "global" row is preceded by the column header
                  if (name=="global")
                    printf "%30s %12s %12s %9s\n", "NAME", "HITS", "MISSES", "HITRATE"

                  printf "%30s %12d %12d %8.2f%%\n", name, hits[name], misses[name], rate
          }     
  '

Higher scrub performance

#!/bin/bash

#
## scrub_fast.sh
#
# Switch the ZFS module parameters that throttle scrubbing.
#   start   write the "fast scrub" values
#   stop    write the conservative values
#   status  print the current value of every handled parameter
# Any other argument (or none) prints a usage line.

sysfs_dir="/sys/module/zfs/parameters"

# Handled parameters; the value lists passed to write_tunables use this order.
tunables=(
  zfs_scan_idle
  zfs_scrub_delay
  zfs_top_maxinflight
  zfs_scan_min_time_ms
  zfs_vdev_scrub_min_active
  zfs_vdev_scrub_max_active
)

# Write the given values, one per entry of the tunables array, in order.
write_tunables () {
  local values=("$@")
  local idx
  for idx in "${!tunables[@]}"
  do
    echo "${values[idx]}" > "${sysfs_dir}/${tunables[idx]}"
  done
}

if [[ $1 == "start" ]]
then
  write_tunables 0 0 512 5000 4 8
elif [[ $1 == "stop" ]]
then
  write_tunables 50 4 32 1000 1 2
elif [[ $1 == "status" ]]
then
  for knob in "${tunables[@]}"
  do
    knob_file="${sysfs_dir}/${knob}"
    printf "%60s\t%d\n" "${knob_file}" "$(cat ${knob_file})"
  done
else
  echo "Usage: ${0} (start|stop|status)"
fi

More information on zpool status

#!/bin/bash

#
## print_zpool.sh
#
# Usage: print_zpool.sh [iostat] [zpool arguments...]
#
# Runs "zpool status -P" (or "zpool iostat -v -P" when the first argument is
# "iostat") and appends two values to every output line whose first field
# contains /dev/: the first field of the matching "lsscsi --scsi_id" line and
# the device node the /dev/disk/by-id link points to. All remaining arguments
# are handed to zpool unchanged.

# Written by Lars Timmann <L@rs.Timmann.de> 2022

columns=5 # number of columns for zpool status
zpool_args=(status)
if [[ ${#} -gt 0 && ${1} == "iostat" ]]
then
  zpool_args=(iostat -v)
  columns=7
  shift
fi

# The awk program uses the four-argument split(), which is a gawk extension,
# so gawk is preferred and plain awk is only a fallback.
awk_cmd=$(command -v gawk || command -v awk)

stdbuf --output=L zpool "${zpool_args[@]}" -P "$@" | "${awk_cmd}" -v columns="${columns}" '
BEGIN {
  # Remember, per device node, the first field of its "lsscsi --scsi_id"
  # line. The device node is read from the second-to-last field.
  command="lsscsi --scsi_id";
  while( (command | getline lsscsi) > 0 ) {
    count=split(lsscsi,fields);
    dev=fields[count-1];
    scsi_id[dev]=fields[1];
  }
  close(command);

  # Map every /dev/disk/by-id link to the node it points at. The link name
  # is the third-to-last field of the "ls -l" line and the target the last
  # one; dots and slashes are stripped from the target before /dev/ is
  # prepended.
  command="ls -Ul /dev/disk/by-id/*";
  while( (command | getline) > 0 ) {
    dev=$NF;
    gsub(/[\.\/]/,"",dev);
    dev_id=$(NF-2);
    device[dev_id]="/dev/"dev;
  }
  close(command);
}
# Lines that name a device path.
$1 ~ /\/dev\// {
  line=$0;
  dev_by_id=$1;
  # Same name without a trailing partition suffix (-partN or plain digits).
  dev_no_part=dev_by_id;
  gsub(/(-part|)[0-9]+$/,"",dev_no_part);
  if( NF > columns ) {
    # More fields than regular columns: keep the first "columns" fields with
    # their original separators in line and move the remainder to rest so it
    # is printed after the two added values.
    count=split(line,a,FS,seps);
    line=seps[0];
    for(i=1;i<columns;i++){
      line=line a[i] seps[i];
    }
    line=line a[columns];
    for(i=columns+1;i<=count;i++){
      rest=rest a[i] seps[i];
    }
  }
  printf("%s %s %s",line,scsi_id[device[dev_no_part]],device[dev_by_id]);
  if(rest!=""){
    printf(" %s",rest);
    rest="";
  }
  printf("\n");
  next;
}
# Flush after the errors: line so the output written so far is not held back.
/^errors:/ {
  print;
  fflush();
  next;
}
{
  print;
}'

Backup ZFS settings

A little script which may be used at your own risk.

#!/bin/bash

# Written by Lars Timmann <L@rs.Timmann.de> 2018
# Tested on solaris 11.3 & Ubuntu Linux 

# This script is a rotten bunch of code... rewrite it!

# Paths of the external tools the functions below call.
# awk interpreter used by print_vdevs
AWK_CMD=/usr/bin/gawk
# used to list the pools and as the command name of the generated
# "create" line for a pool
ZPOOL_CMD=/sbin/zpool
# used to list datasets and read their properties, and as the command name
# of the generated "create" lines for filesystems and volumes
ZFS_CMD=/sbin/zfs
# used by print_vdevs to read the pool configuration
ZDB_CMD=/sbin/zdb

#######################################
# Print the locally set properties of a dataset as option lines for a
# "create" command, one "<TAB>-o name=value \" line per property.
# Globals:
#   ZFS_CMD (read)
# Arguments:
#   $1 - dataset name
#   $2 - property selection handed to "zfs get" (e.g. all)
#   $3 - bash regular expression; properties whose name matches are skipped.
#        An empty expression matches every name, so nothing is printed;
#        pass '^$' to exclude nothing.
# Outputs:
#   The option lines on stdout.
#######################################
function print_local_options () {
  local dataset=$1
  local option=$2
  local exclude_regex=$3
  local property value

  # ZFS_CMD stays unquoted so that it may carry a prefix such as "sudo".
  # The other expansions are quoted so that a dataset name containing
  # spaces reaches zfs as a single argument.
  ${ZFS_CMD} get -s local -Ho property,value -p "${option}" "${dataset}" \
    | while read -r property value
      do
        if [[ ${property} =~ ${exclude_regex} ]]
        then
          continue
        fi
        if [[ ${property} == 'share.*' ]]
        then
          # A property literally named "share.*" is replaced by the
          # properties selected with share.all.
          print_local_options "${dataset}" 'share.all' '^$'
        else
          printf '\t-o %s=%s \\\n' "${property}" "${value}"
        fi
      done
}

#######################################
# Print a "create" command that recreates one filesystem with its locally
# set properties.
# Globals:
#   ZFS_CMD (read)
# Arguments:
#   $1 - filesystem name
# Outputs:
#   The command on stdout: the create line, the option lines produced by
#   print_local_options, then the filesystem name. The name is shell-escaped,
#   which leaves names made of ordinary characters unchanged.
#######################################
function print_filesystem () {
  local filesystem=$1

  printf '%s create \\\n' "${ZFS_CMD}"
  print_local_options "${filesystem}" 'all' '^$'
  printf '\t%q\n' "${filesystem}"
}

#######################################
# Print a commented "create" command for every filesystem below a pool.
# The entry whose name equals the pool name is skipped.
# Globals:
#   ZFS_CMD (read)
# Arguments:
#   $1 - pool name
# Outputs:
#   Per filesystem a comment header, the output of print_filesystem and an
#   empty line, all on stdout.
#######################################
function print_filesystems () {
  local zpool=$1
  local filesystem

  # Names are read line by line so that a name containing spaces stays one
  # item. The list arrives on descriptor 3, leaving stdin of the commands
  # inside the loop untouched.
  while IFS= read -r -u 3 filesystem
  do
    if [[ ${filesystem} == "${zpool}" ]] ; then continue ; fi
    printf '#\n## Filesystem: %s\n#\n\n' "${filesystem}"
    print_filesystem "${filesystem}"
    printf '\n'
  done 3< <(${ZFS_CMD} list -Ho name -t filesystem -r "${zpool}")
}

#######################################
# Print a "create" command that recreates one volume with its size, block
# size and locally set properties.
# Globals:
#   ZFS_CMD (read)
# Arguments:
#   $1 - volume name
# Outputs:
#   The command on stdout. The name on the last line is shell-escaped, which
#   leaves names made of ordinary characters unchanged.
#######################################
function print_volume () {
  local zvol=$1
  local volsize volblocksize

  # -p requests exact numeric values; the human-readable form is rounded and
  # would not recreate a volume of the same size.
  volsize=$(${ZFS_CMD} get -Hp -o value volsize "${zvol}")
  volblocksize=$(${ZFS_CMD} get -Hp -o value volblocksize "${zvol}")

  printf '%s create \\\n\t-V %s \\\n\t-b %s \\\n' "${ZFS_CMD}" "${volsize}" "${volblocksize}"
  # volsize is already given with -V, so it is excluded here together with
  # refreservation.
  print_local_options "${zvol}" 'all' '(volsize|refreservation)'
  printf '\t%q\n' "${zvol}"
}

#######################################
# Print a commented "create" command for every volume below a pool.
# Globals:
#   ZFS_CMD (read)
# Arguments:
#   $1 - pool name
# Outputs:
#   Per volume a comment header, the output of print_volume and an empty
#   line, all on stdout.
#######################################
function print_volumes () {
  local zpool=$1
  local zvol

  # Names are read line by line so that a name containing spaces stays one
  # item. The list arrives on descriptor 3, leaving stdin of the commands
  # inside the loop untouched.
  while IFS= read -r -u 3 zvol
  do
    printf '#\n## Volume: %s\n#\n\n' "${zvol}"
    print_volume "${zvol}"
    printf '\n'
  done 3< <(${ZFS_CMD} list -Ho name -t volume -r "${zpool}")
}

#######################################
# Print the vdev part of a "zpool create" command, read from "zdb -C".
# Every item is written as " \<newline><TAB>item" so the output continues a
# command line that is already open; a final newline ends it.
# Globals:
#   ZDB_CMD (read), AWK_CMD (read)
# Arguments:
#   $1 - pool name
# Outputs:
#   mirror and raidzN keywords and device paths in configuration order, on
#   stdout. Other vdev types print only their device paths.
#   NOTE(review): log, cache and spare devices are not marked as such in the
#   output - verify the result for pools that have them.
#######################################
function print_vdevs () {
  local zpool=$1

  ${ZDB_CMD} -C "${zpool}" | ${AWK_CMD} -F':' '
    # Value of a "key: value" line: everything after the first colon, without
    # surrounding blanks and without the enclosing single quotes. Taking the
    # text after the first colon keeps device paths that contain colons
    # themselves in one piece.
    function value_of(line,    v) {
      v = line;
      sub(/^[^:]*:[ \t]*/, "", v);
      sub(/[ \t]+$/, "", v);
      if (v ~ /^\047.*\047$/) {
        v = substr(v, 2, length(v) - 2);
      }
      return v;
    }
    # The parity level of a raidz vdev is expected on an nparity line after
    # its type line, so printing the keyword is deferred. If no nparity line
    # shows up, plain raidz is printed before the next item.
    function flush_raidz() {
      if (raidz_pending) {
        printf " \\\n\traidz";
        raidz_pending = 0;
      }
    }
    $1 ~ /^[ \t]*type$/ {
      flush_raidz();
      type = value_of($0);
      if ( type == "mirror" ) {
        printf " \\\n\t%s", type;
      } else if ( type == "raidz" ) {
        raidz_pending = 1;
      }
    }
    $1 ~ /^[ \t]*nparity$/ {
      if (raidz_pending) {
        printf " \\\n\traidz%s", value_of($0);
        raidz_pending = 0;
      }
    }
    $1 ~ /^[ \t]*path$/ {
      flush_raidz();
      printf " \\\n\t%s", value_of($0);
    }
    END {
      flush_raidz();
      printf "\n";
    }
  '
}

#######################################
# Print everything needed to recreate one pool: a banner, the "create"
# command for the pool, and the commands for its filesystems and volumes.
# Globals:
#   ZPOOL_CMD (read)
# Arguments:
#   $1 - pool name
# Outputs:
#   The generated text on stdout.
#######################################
function print_zpool () {
  local zpool=$1
  local banner='#############################################################'

  printf '%s\n' "${banner}"
  printf '#\n## ZPool: %s\n#\n' "${zpool}"
  printf '%s\n\n' "${banner}"

  printf '%s create \\\n' "${ZPOOL_CMD}"
  # The properties listed here are those of the dataset named like the pool.
  # "zpool create" takes file system properties with -O (-o is reserved for
  # pool properties), so the option letter is rewritten; the sed expression
  # contains literal tab characters.
  # NOTE(review): the exclude expression matches the literal text /@/, which
  # presumably never occurs in a property name, so nothing is excluded -
  # confirm that this is intended.
  print_local_options "${zpool}" 'all' '/@/' | sed $'s/^\t-o /\t-O /'
  printf '\t%s' "${zpool}"
  print_vdevs "${zpool}"
  printf '\n'

  printf '%s\n\n' "${banner}"
  print_filesystems   "${zpool}"
  print_volumes       "${zpool}"
}

# Keep the kernel name in OS and also set a variable named after it to 1
# (e.g. Linux=1). Nothing in this script reads either of them.
OS=$(uname -s)
printf -v "${OS}" '%s' 1
HOSTNAME=$(hostname)

# Host banner, followed by the dump of every pool that "zpool list" reports.
printf '#############################################################\n# Hostname: %s\n#############################################################\n\n' "${HOSTNAME}"
for ZPOOL in $(${ZPOOL_CMD} list -Ho name)
do
  print_zpool ${ZPOOL}
done

Links