Linux Software RAID
[[category:Linux]]
=mdadm=
==Force rebuild of a failed RAID==
Example for /dev/md10
===The problem: Two failed disks in a RAID5===
Looks ugly, but with a bit of luck the disks are just marked as bad.
==== cat /proc/mdstat ====
<syntaxhighlight lang=bash>
# cat /proc/mdstat
Personalities : [raid1] [raid6] [raid5] [raid4]
...
md10 : inactive sdap1[11] sdao1[5] sdah1[15](S) sdag1[4] sdy1[3] sdz1[14] sdr1[8] sdb1[13] sdq1[16](S) sdi1[1] sda1[12]
      5236577280 blocks super 1.2
...
</syntaxhighlight>
The state is <i>inactive</i>; this is not what we want. Look at the details in the next step.
==== mdadm --detail ====
<syntaxhighlight lang=bash>
# mdadm --detail /dev/md10
/dev/md10:
        Version : 1.2
  Creation Time : Wed Feb 6 13:44:52 2013
     Raid Level : raid5
  Used Dev Size : 476052288 (454.00 GiB 487.48 GB)
   Raid Devices : 11
  Total Devices : 11
    Persistence : Superblock is persistent

    Update Time : Wed Jun 15 17:46:57 2016
          State : active, FAILED, Not Started
 Active Devices : 9
Working Devices : 11
 Failed Devices : 0
  Spare Devices : 2

         Layout : left-symmetric
     Chunk Size : 64K

           Name : md10
           UUID : 82f2b88d:276a1fd3:55a4928e:b2228edf
         Events : 17071

    Number   Major   Minor   RaidDevice State
      11      66     145        0      active sync   /dev/sdap1
       1       8     129        1      active sync   /dev/sdi1
       2       0       0        2      removed
       3      65     129        3      active sync   /dev/sdy1
       4      66       1        4      active sync   /dev/sdag1
       5      66     129        5      active sync   /dev/sdao1
      12       8       1        6      active sync   /dev/sda1
       7       0       0        7      removed
       8      65      17        8      active sync   /dev/sdr1
      13       8      17        9      active sync   /dev/sdb1
      14      65     145       10      active sync   /dev/sdz1

      15      66      17        -      spare   /dev/sdah1
      16      65       1        -      spare   /dev/sdq1
</syntaxhighlight>
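Before forcing a reassembly it can help to compare the event counters stored in the member superblocks; members that are only slightly behind usually rejoin cleanly. A minimal sketch, assuming all members are the first partition of /dev/sd* disks (adjust the glob to your layout):
<syntaxhighlight lang=bash>
# Print the event counter and per-member array state from each superblock.
for d in /dev/sd*1; do
  echo "== $d =="
  mdadm --examine "$d" 2>/dev/null | egrep 'Events|Array State'
done
</syntaxhighlight>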
===Force the rescan and reassemble the RAID===
For a SCSI rescan you can try this: Scan all SCSI buses for new devices (a minimal sketch follows below).
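A sketch of such a rescan via sysfs, assuming the disks hang off locally attached SCSI/SAS host adapters (run as root):
<syntaxhighlight lang=bash>
# "- - -" means: rescan all channels, all targets and all LUNs on this host.
for host in /sys/class/scsi_host/host*; do
  echo "- - -" > "${host}/scan"
done
</syntaxhighlight>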
And you have to do this:
<syntaxhighlight lang=bash>
# mdadm --scan /dev/md10
# mdadm --assemble --force --scan
# mdadm --run /dev/md10
</syntaxhighlight>
===Check the status===
<syntaxhighlight lang=bash>
# mdadm --detail /dev/md10
/dev/md10:
        Version : 1.2
  Creation Time : Wed Feb 6 13:44:52 2013
     Raid Level : raid5
     Array Size : 4760522880 (4539.99 GiB 4874.78 GB)
  Used Dev Size : 476052288 (454.00 GiB 487.48 GB)
   Raid Devices : 11
  Total Devices : 12
    Persistence : Superblock is persistent

    Update Time : Thu Jun 16 10:59:16 2016
          State : clean, degraded, recovering
 Active Devices : 10
Working Devices : 12
 Failed Devices : 0
  Spare Devices : 2

         Layout : left-symmetric
     Chunk Size : 64K

 Rebuild Status : 5% complete

           Name : md10
           UUID : 82f2b88d:276a1fd3:55a4928e:b2228edf
         Events : 17074

    Number   Major   Minor   RaidDevice State
      11      66     145        0      active sync   /dev/sdap1
       1       8     129        1      active sync   /dev/sdi1
      16      65       1        2      spare rebuilding   /dev/sdq1
       3      65     129        3      active sync   /dev/sdy1
       4      66       1        4      active sync   /dev/sdag1
       5      66     129        5      active sync   /dev/sdao1
      12       8       1        6      active sync   /dev/sda1
       7       8     145        7      active sync   /dev/sdj1
       8      65      17        8      active sync   /dev/sdr1
      13       8      17        9      active sync   /dev/sdb1
      14      65     145       10      active sync   /dev/sdz1

      15      66      17        -      spare   /dev/sdah1
</syntaxhighlight>
This is good:
 State : clean, degraded, recovering
Better wait for the recovery to complete before the next reboot:
 Rebuild Status : 5% complete
It should continue rebuilding after a reboot, but... you know the devil is in the details.
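To keep an eye on the rebuild, and optionally raise the md resync speed limits while it runs, something like this can be used (the speed values are only examples):
<syntaxhighlight lang=bash>
# Watch the rebuild progress.
watch -n 10 cat /proc/mdstat

# Raise the global resync speed limits (KiB/s per device); values are examples.
echo 50000  > /proc/sys/dev/raid/speed_limit_min
echo 200000 > /proc/sys/dev/raid/speed_limit_max
</syntaxhighlight>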
==Replace a disk in a mirror==
Device /dev/cciss/c0d1 is the newly replaced disk in an [[HP_Smart_Array_Controller#reenable_disk_after_replacement | HP Array Controller]]. Copy the partition table from the surviving disk, then remove the stale array members and re-add the new partitions:
<syntaxhighlight lang=bash>
[root@app02 ~]# sfdisk -d /dev/cciss/c0d0 | sfdisk --no-reread --force /dev/cciss/c0d1
[root@app02 ~]# mdadm --manage /dev/md0 --fail /dev/cciss/c0d1p1
[root@app02 ~]# mdadm --manage /dev/md0 --remove /dev/cciss/c0d1p1
[root@app02 ~]# mdadm --manage /dev/md0 --add /dev/cciss/c0d1p1
[root@app02 ~]# mdadm --manage /dev/md1 --fail /dev/cciss/c0d1p2
[root@app02 ~]# mdadm --manage /dev/md1 --remove /dev/cciss/c0d1p2
[root@app02 ~]# mdadm --manage /dev/md1 --add /dev/cciss/c0d1p2
[root@app02 ~]# cat /proc/mdstat
Personalities : [raid1]
md1 : active raid1 cciss/c0d1p2[2] cciss/c0d0p2[0]
      36925312 blocks [2/1] [U_]
        resync=DELAYED
md0 : active raid1 cciss/c0d1p1[2] cciss/c0d0p1[0]
      256003712 blocks [2/1] [U_]
      [>....................]  recovery =  0.0% (38144/256003712) finish=2680.2min speed=1589K/sec

unused devices: <none>
</syntaxhighlight>
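If the mirror also carries the boot loader, it is worth reinstalling it on the replaced disk after the resync, so the machine can still boot from either half of the mirror. A sketch, assuming GRUB is used on this system:
<syntaxhighlight lang=bash>
# Install the boot loader onto the replaced disk (assumes GRUB is in use).
grub-install /dev/cciss/c0d1
</syntaxhighlight>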