git.rot13.org Git - gnt-info/blob - doc/megaraid-to-md.txt

   1 In our testing, md raid10 is about 30% faster than megaraid's internal raid10.
   2
   3 First, let's test the original configuration (6*1T disks and one SSD) before and after the BIOS update.
   4
   5 root@lib24:~# lsblk --scsi -m
   6 NAME HCTL       TYPE VENDOR   MODEL             REV TRAN NAME   SIZE OWNER GROUP MODE
   7 sdb  0:0:7:0    disk ATA      INTEL SSDSC2BW24 DC32      sdb  223.6G root  disk  brw-rw----
   8 sdc  0:2:0:0    disk DELL     PERC H310        2.12      sdc    2.7T root  disk  brw-rw----
   9
  10 root@lib24:~# dmidecode | grep 'BIOS Rev'
  11         BIOS Revision: 2.5
  12
  13 root@lib24:~# hdparm -tT /dev/sd[bc]
  14
  15 /dev/sdb:
  16  Timing cached reads:   13750 MB in  2.00 seconds = 6890.40 MB/sec
  17  Timing buffered disk reads: 950 MB in  3.00 seconds = 316.44 MB/sec
  18
  19 /dev/sdc:
  20  Timing cached reads:   13304 MB in  1.99 seconds = 6670.01 MB/sec
  21  Timing buffered disk reads: 1332 MB in  3.00 seconds = 443.81 MB/sec
  22
  23
  24
  25 root@lib24:~# dmidecode | grep 'BIOS Rev'
  26         BIOS Revision: 2.7
  27 root@lib24:~# lsblk --scsi -m
  28 NAME HCTL       TYPE VENDOR   MODEL             REV TRAN NAME   SIZE OWNER GROUP MODE
  29 sda  0:0:6:0    disk ATA      WDC WD1002FBYS-1 0C12      sda  931.5G root  disk  brw-rw----
  30 sdb  0:0:7:0    disk ATA      INTEL SSDSC2BW24 DC32      sdb  223.6G root  disk  brw-rw----
  31 sdc  0:2:0:0    disk DELL     PERC H310        2.12      sdc    2.7T root  disk  brw-rw----
  32 root@lib24:~# hdparm -tT /dev/sd[abc]
  33
  34 /dev/sda:
  35  Timing cached reads:   13960 MB in  1.99 seconds = 7002.59 MB/sec
  36  Timing buffered disk reads: 320 MB in  3.00 seconds = 106.57 MB/sec
  37
  38 /dev/sdb:
  39  Timing cached reads:   14004 MB in  1.99 seconds = 7024.47 MB/sec
  40  Timing buffered disk reads: 962 MB in  3.00 seconds = 320.66 MB/sec
  41
  42 /dev/sdc:
  43  Timing cached reads:   13920 MB in  1.99 seconds = 6981.76 MB/sec
  44  Timing buffered disk reads: 1356 MB in  3.00 seconds = 451.81 MB/sec
  45
  46
  47
  48
  49
  50
  51 Here are the steps to replace hardware megaraid with software md raid10 on ganeti.
  52
  53
  54
  55
  56 root@lib30:/srv/gnt-info# gnt-node migrate lib28
  57
  58 root@lib30:/srv/gnt-info# gnt-node modify --drained yes lib28
  59
  60 root@lib30:/srv/gnt-info# hbal -L -X
  61
  62
  63
  64 This will migrate instances off the node and mark it as drained; hbal will then re-distribute drbd disks
  65 from it to the rest of the cluster.
  66
  67 An alternative is to use gnt-instance modify -t plain, but that requires an instance reboot.
  68
  69
  70 root@lib28:~# vgremove ffzgvg
  71
  72 root@lib28:~# pvremove /dev/bcache0
  73
  74
  75
  76 Then, we need to find the bcache block device and deactivate it:
  77
  78 root@lib28:~# cd /sys/fs/bcache/
  79 root@lib28:/sys/fs/bcache# cd 4a61e966-f18d-4fab-836a-3ff027963781/
  80
  81 root@lib28:/sys/fs/bcache/4a61e966-f18d-4fab-836a-3ff027963781# ls -al bdev0
  82 lrwxrwxrwx 1 root root 0 Sep 19 19:46 bdev0 -> ../../../devices/pci0000:00/0000:00:02.2/0000:03:00.0/host0/target0:2:0/0:2:0:0/block/sdb/bcache
  83
  84
  85 root@lib28:~# echo 1 > /sys/block/sdb/bcache/stop
  86 [Wed Sep 19 20:05:53 2018] bcache: bcache_device_free() bcache0 stopped
  87
  88 root@lib28:~# megacli -CfgDsply -a0
  89
  90
  91 Erase existing config
  92
  93 root@lib28:~# megacli -CfgLdDel -L0 -a0
  94
  95 Adapter 0: Deleted Virtual Drive-0(target id-0)
  96
  97 Exit Code: 0x00
  98
  99
 100 Convert all disks to jbod
 101
 102 root@lib28:~# megacli -PDMakeJBOD -PhysDrv[32:0] -a0
 103
 104 Adapter: 0: EnclId-32 SlotId-0 state changed to JBOD.
 105
 106 Exit Code: 0x00
 107
 108 root@lib28:~# megacli -PDMakeJBOD -PhysDrv[32:1] -a0
 109
 110 ...
 111
 112 megacli -PDMakeJBOD -PhysDrv[32:6] -a0
 113
 114 Adapter: 0: Failed to change PD state at EnclId-32 SlotId-6.
 115
 116 Exit Code: 0x01
 117
 118
 119 This probably means that it has a foreign config (makes sense, the 7th disk
 120 comes from another server with raid on it):
 121
 122 root@lib28:~# megacli -CfgForeign -Clear 0 -a0
 123
 124 Foreign configuration 0 is cleared on controller 0.
 125
 126 Exit Code: 0x00
 127
 128
 129 root@lib28:~# megacli -PDMakeJBOD -PhysDrv[32:6] -a0
 130
 131 Adapter: 0: EnclId-32 SlotId-6 state changed to JBOD.
 132
 133 Exit Code: 0x00
 134
 135
 136 Let's now wipe disks which were detected as bcache (wipefs without -a only lists signatures; add -a to erase them):
 137
 138 root@lib24:~# dmesg | grep register_bdev
 139 [    4.941396] bcache: register_bdev() registered backing device sdc
 140 [  496.314350] bcache: register_bdev() registered backing device sdc
 141 [  510.221356] bcache: register_bdev() registered backing device sdd
 142
 143 root@lib24:~# wipefs /dev/sd[cd]
 144 offset               type
 145 ----------------------------------------------------------------
 146 0x1018               bcache   [other]
 147                      UUID:  08a9ea8e-0a25-4efd-9075-8966c0d1fb00
 148
 149 offset               type
 150 ----------------------------------------------------------------
 151 0x1018               bcache   [other]
 152                      UUID:  08a9ea8e-0a25-4efd-9075-8966c0d1fb00
 153
 154
 155 If there are partition tables on disks, they should be wiped with wipefs -a /dev/drive
 156
 157 It's probably a good idea to reboot here so that the linux kernel can re-enumerate disks.
 158
 159
 160
 161
 162 root@lib28:~# lsblk --scsi -m
 163 NAME HCTL       TYPE VENDOR   MODEL             REV TRAN NAME   SIZE OWNER GROUP MODE
 164 sda  0:0:7:0    disk ATA      INTEL SSDSC2BW24 DC32      sda  223.6G root  disk  brw-rw----
 165 sdb  0:0:0:0    disk ATA      ST1000NM0033-9ZM GA0A      sdb  931.5G root  disk  brw-rw----
 166 sdc  0:0:1:0    disk ATA      ST1000NM0033-9ZM GA0A      sdc  931.5G root  disk  brw-rw----
 167 sdd  0:0:2:0    disk ATA      ST1000NM0033-9ZM GA0A      sdd  931.5G root  disk  brw-rw----
 168 sde  0:0:3:0    disk ATA      ST1000NM0033-9ZM GA0A      sde  931.5G root  disk  brw-rw----
 169 sdf  0:0:4:0    disk ATA      ST1000NM0033-9ZM GA0A      sdf  931.5G root  disk  brw-rw----
 170 sdg  0:0:5:0    disk ATA      ST1000NM0033-9ZM GA0A      sdg  931.5G root  disk  brw-rw----
 171 sdh  0:0:6:0    disk ATA      ST1000NM0033-9ZM GA0A      sdh  931.5G root  disk  brw-rw----
 172
 173
 174 If you didn't wipe old bcache data, you might get an error when creating the array:
 175
 176 root@lib28:~# mdadm --create /dev/md0 --level=10 --raid-devices=7 /dev/sd{b,c,d,e,f,g,h}
 177 mdadm: cannot open /dev/sdb: Device or resource busy
 178 root@lib28:~# lsblk
 179 NAME      MAJ:MIN RM   SIZE RO TYPE MOUNTPOINT
 180 sda         8:0    0 223.6G  0 disk
 181 ├─sda1      8:1    0   260M  0 part /boot/efi
 182 ├─sda2      8:2    0    10G  0 part /
 183 └─sda3      8:3    0 213.3G  0 part
 184 sdb         8:16   0 931.5G  0 disk
 185 └─bcache0 254:0    0 931.5G  0 disk
 186 sdc         8:32   0 931.5G  0 disk
 187 └─bcache1 254:1    0 931.5G  0 disk
 188 sdd         8:48   0 931.5G  0 disk
 189 sde         8:64   0 931.5G  0 disk
 190 sdf         8:80   0 931.5G  0 disk
 191 sdg         8:96   0 931.5G  0 disk
 192 sdh         8:112  0 931.5G  0 disk
 193
 194 Duh!
 195
 196 root@lib28:~# wipefs /dev/sda
 197 offset               type
 198 ----------------------------------------------------------------
 199 0x1018               bcache   [other]
 200                      UUID:  2dbf6232-0e89-4d7b-87c7-79897b0a34f2
 201
 202 root@lib28:~# wipefs /dev/sdb
 203 offset               type
 204 ----------------------------------------------------------------
 205 0x1018               bcache   [other]
 206                      UUID:  2dbf6232-0e89-4d7b-87c7-79897b0a34f2
 207
 208 root@lib28:~# echo 1 > /sys/block/sda/bcache/stop
 209 root@lib28:~# [Wed Sep 19 20:25:14 2018] bcache: bcache_device_free() bcache0 stopped
 210
 211 root@lib28:~# echo 1 > /sys/block/sdb/bcache/stop
 212 root@lib28:~# [Wed Sep 19 20:25:20 2018] bcache: bcache_device_free() bcache1 stopped
 213
 214 root@lib28:~# lsblk
 215 NAME   MAJ:MIN RM   SIZE RO TYPE MOUNTPOINT
 216 sda      8:0    0 931.5G  0 disk
 217 sdb      8:16   0 931.5G  0 disk
 218 sdc      8:32   0 931.5G  0 disk
 219 sdd      8:48   0 931.5G  0 disk
 220 sde      8:64   0 931.5G  0 disk
 221 sdf      8:80   0 931.5G  0 disk
 222 sdg      8:96   0 931.5G  0 disk
 223 sdh      8:112  0 223.6G  0 disk
 224 ├─sdh1   8:113  0   260M  0 part /boot/efi
 225 ├─sdh2   8:114  0    10G  0 part /
 226 └─sdh3   8:115  0 213.3G  0 part
 227
 228 root@lib28:~# mdadm --create /dev/md0 --level=10 --raid-devices=7 /dev/sd{a,b,c,d,e,f,g}
 229 mdadm: Defaulting to version 1.2 metadata
 230 mdadm: array /dev/md0 started.
 231
 232 ...
 233
 234 Wait for rebuild, install bios update, reboot as needed.
 235
 236 root@lib28:~# dmidecode | grep 'BIOS Rev'
 237         BIOS Revision: 2.7
 238
 239 root@lib28:~# hdparm -Tt /dev/sdh /dev/sda /dev/md0
 240
 241 # ssd
 242 /dev/sdh:
 243  Timing cached reads:   13856 MB in  1.99 seconds = 6950.11 MB/sec
 244  Timing buffered disk reads: 940 MB in  3.00 seconds = 313.22 MB/sec
 245
 246 # single drive
 247 /dev/sda:
 248  Timing cached reads:   13722 MB in  1.99 seconds = 6883.27 MB/sec
 249  Timing buffered disk reads: 572 MB in  3.01 seconds = 190.06 MB/sec
 250
 251 # raid10, 7 disks
 252 /dev/md0:
 253  Timing cached reads:   13826 MB in  1.99 seconds = 6935.19 MB/sec
 254  Timing buffered disk reads: 1888 MB in  3.01 seconds = 628.05 MB/sec
 255
 256
 257 Now we need to create new backing block device for bcache:
 258
 259 root@lib28:~# cat /proc/mdstat
 260 Personalities : [raid10] [linear] [multipath] [raid0] [raid1] [raid6] [raid5] [raid4]
 261 md127 : active (auto-read-only) raid10 sda[0] sdc[2] sdd[3] sdf[5] sdg[6] sdb[1] sde[4]
 262       3418209280 blocks super 1.2 512K chunks 2 near-copies [7/7] [UUUUUUU]
 263       bitmap: 0/26 pages [0KB], 65536KB chunk
 264
 265 root@lib28:~# make-bcache -B /dev/md127
 266 UUID:                   1c2c022b-5ed1-48d2-9ec9-499465af71a8
 267 Set UUID:               83ad5994-b266-4e4b-ab6d-9be8fe419e62
 268 version:                1
 269 block_size:             1
 270 data_offset:            16
 271 [Thu Sep 20 02:23:12 2018] bcache: register_bdev() registered backing device md127
 272
 273
 274 Now attach cache device back to it:
 275
 276 root@lib28:~# bcache-super-show /dev/sdh3 | grep cset.uuid
 277 cset.uuid               4a61e966-f18d-4fab-836a-3ff027963781
 278
 279 root@lib28:~# echo 4a61e966-f18d-4fab-836a-3ff027963781 > /sys/block/bcache0/bcache/attach
 280 [Thu Sep 20 14:18:08 2018] bcache: bch_cached_dev_attach() Caching md127 as bcache0 on set 4a61e966-f18d-4fab-836a-3ff027963781
 281
 282 Same as one-liner for easier copy/paste:
 283
 284 root@lib30:/srv/gnt-info# bcache-super-show /dev/sdh3 | grep cset.uuid | awk '{ print $2 }' > /sys/block/bcache0/bcache/attach
 285 [Mon Sep 24 09:38:35 2018] bcache: bch_cached_dev_attach() Caching md0 as bcache0 on set b8b500d6-b933-428b-a040-5fb0b2cbef49
 286
 287
 288
 289 Verify that bcache is active (values should NOT be all zeros if it's working):
 290
 291 root@lib28:~# grep . /sys/fs/bcache/*/stats_total/*
 292 /sys/fs/bcache/4a61e966-f18d-4fab-836a-3ff027963781/stats_total/bypassed:40.8M
 293 /sys/fs/bcache/4a61e966-f18d-4fab-836a-3ff027963781/stats_total/cache_bypass_hits:0
 294 /sys/fs/bcache/4a61e966-f18d-4fab-836a-3ff027963781/stats_total/cache_bypass_misses:11
 295 /sys/fs/bcache/4a61e966-f18d-4fab-836a-3ff027963781/stats_total/cache_hit_ratio:0
 296 /sys/fs/bcache/4a61e966-f18d-4fab-836a-3ff027963781/stats_total/cache_hits:0
 297 /sys/fs/bcache/4a61e966-f18d-4fab-836a-3ff027963781/stats_total/cache_miss_collisions:0
 298 /sys/fs/bcache/4a61e966-f18d-4fab-836a-3ff027963781/stats_total/cache_misses:128
 299 /sys/fs/bcache/4a61e966-f18d-4fab-836a-3ff027963781/stats_total/cache_readaheads:0
 300
 301 root@lib26:~# cat /sys/block/bcache0/bcache/state
 302 clean
 303
 304
 305 Optionally, you might want to turn on writeback mode; the state will then change:
 306
 307 root@lib28:~# echo writeback > /sys/block/bcache0/bcache/cache_mode
 308 root@lib28:~# cat /sys/block/bcache0/bcache/state
 309 dirty
 310
 311
 312
 313
 314 root@lib28:~# pvcreate /dev/bcache0
 315   Configuration setting "activation/thin_check_executable" unknown.
 316   Physical volume "/dev/bcache0" successfully created.
 317
 318 root@lib28:~# vgcreate ffzgvg /dev/bcache0
 319   Configuration setting "activation/thin_check_executable" unknown.
 320   Volume group "ffzgvg" successfully created
 321
 322
 323 root@lib30:~# gnt-node modify --drained no lib28
 324
 325
 326 We should run hbal now to re-balance drbd, but we will first migrate all instances off the next
 327 node to be upgraded, which saves copying the data across the cluster one extra time.
 328
 329 root@lib30:~# gnt-node migrate lib24
 330
 331 root@lib30:~# gnt-node modify --drained yes lib24
 332
 333 root@lib30:~# hbal -L -X
 334