git.rot13.org Git - gnt-info/blob - doc/2018-07-14-upgrade-to-stretch.txt

   1 # 2018-07-14 upgrade to stretch
   2
   3 Errors after upgrade:
   4
   5 root@r1u28:~# gnt-cluster verify
   6 Sat Jul 14 20:20:15 2018   - ERROR: instance syslog: couldn't retrieve status for disk/0 on r1u32.gnt.ffzg.hr: rbd showmapped failed (exited with exit code 1): rbd: unrecognised option '-p'\n
   7 Sat Jul 14 20:20:15 2018   - ERROR: instance mudrac: couldn't retrieve status for disk/0 on r1u32.gnt.ffzg.hr: rbd showmapped failed (exited with exit code 1): rbd: unrecognised option '-p'\n
   8 Sat Jul 14 20:20:15 2018   - ERROR: instance mudrac: couldn't retrieve status for disk/1 on r1u32.gnt.ffzg.hr: rbd showmapped failed (exited with exit code 1): rbd: unrecognised option '-p'\n
   9 Sat Jul 14 20:20:15 2018   - ERROR: instance mudrac: couldn't retrieve status for disk/2 on r1u32.gnt.ffzg.hr: rbd showmapped failed (exited with exit code 1): rbd: unrecognised option '-p'\n
  10 Sat Jul 14 20:20:15 2018   - ERROR: instance mudrac: couldn't retrieve status for disk/3 on r1u32.gnt.ffzg.hr: rbd showmapped failed (exited with exit code 1): rbd: unrecognised option '-p'\n
  11 Sat Jul 14 20:20:15 2018   - ERROR: instance odin.ffzg.hr: couldn't retrieve status for disk/0 on r1u28.gnt.ffzg.hr: rbd showmapped failed (exited with exit code 1): rbd: unrecognised option '-p'\n
  12 Sat Jul 14 20:20:15 2018   - ERROR: instance odin.ffzg.hr: couldn't retrieve status for disk/1 on r1u28.gnt.ffzg.hr: rbd showmapped failed (exited with exit code 1): rbd: unrecognised option '-p'\n
  13 Sat Jul 14 20:20:15 2018   - ERROR: instance gray: couldn't retrieve status for disk/0 on r1u28.gnt.ffzg.hr: rbd showmapped failed (exited with exit code 1): rbd: unrecognised option '-p'\n
  14 Sat Jul 14 20:20:15 2018   - ERROR: instance video: instance not running on its primary node r1u28.gnt.ffzg.hr
  15
  16 ## rbd -p problem
  17
  18 known issue with option -p: https://github.com/ganeti/ganeti/issues/1233
  19
   20 a better version of the fix is in: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850823
  21
  22
  23 root@r1u28:~# vi /usr/share/ganeti/2.15/ganeti/storage/bdev.py
  24
  25 root@r1u28:~# gnt-cluster copyfile /usr/share/ganeti/2.15/ganeti/storage/bdev.py
  26
  27 root@r1u28:~# /etc/init.d/ganeti restart
  28 Restarting ganeti (via systemctl): ganeti.service.
  29 root@r1u28:~# ssh r1u30 /etc/init.d/ganeti restart
  30 Restarting ganeti (via systemctl): ganeti.service.
  31 root@r1u28:~# ssh r1u32 /etc/init.d/ganeti restart
  32 Restarting ganeti (via systemctl): ganeti.service.
  33
  34 root@r1u28:~# gnt-cluster verify
  35 Submitted jobs 268767, 268768
  36 Waiting for job 268767 ...
  37 Sat Jul 14 20:35:09 2018 * Verifying cluster config
  38 Sat Jul 14 20:35:09 2018 * Verifying cluster certificate files
  39 Sat Jul 14 20:35:09 2018 * Verifying hypervisor parameters
  40 Sat Jul 14 20:35:09 2018 * Verifying all nodes belong to an existing group
  41 Waiting for job 268768 ...
  42 Sat Jul 14 20:35:10 2018 * Verifying group 'default'
  43 Sat Jul 14 20:35:10 2018 * Gathering data (3 nodes)
  44 Sat Jul 14 20:35:11 2018 * Gathering information about nodes (3 nodes)
  45 Sat Jul 14 20:35:13 2018 * Gathering disk information (3 nodes)
  46 Sat Jul 14 20:35:16 2018 * Verifying configuration file consistency
  47 Sat Jul 14 20:35:16 2018 * Verifying node status
  48 Sat Jul 14 20:35:16 2018   - ERROR: cluster: ghost disk '6a41d54a-ab7c-4b99-a99e-529f925135e4' in temporary DRBD map
  49 Sat Jul 14 20:35:16 2018   - ERROR: cluster: ghost disk '6a41d54a-ab7c-4b99-a99e-529f925135e4' in temporary DRBD map
  50 Sat Jul 14 20:35:16 2018 * Verifying instance status
  51 Sat Jul 14 20:35:16 2018   - ERROR: instance video: instance not running on its primary node r1u28.gnt.ffzg.hr
  52 Sat Jul 14 20:35:16 2018 * Verifying orphan volumes
  53 Sat Jul 14 20:35:16 2018 * Verifying N+1 Memory redundancy
  54 Sat Jul 14 20:35:16 2018 * Other Notes
  55 Sat Jul 14 20:35:16 2018 * Hooks Results
  56
  57
  58
  59 ## qemu rbd fix
  60
  61 root@r1u28:~# gnt-ill video
  62 Instance Status     VCPUs Memory DiskUsage Disk_template Disks Primary_node      Secondary_Nodes
  63 video    ERROR_down     -      -    110.0G rbd               2 r1u28.gnt.ffzg.hr
  64 root@r1u28:~# gnt-instance start video
  65 Waiting for job 268771 for video ...
  66 Job 268771 for video has failed: Failure: command execution error:
  67 Could not start instance 'video': Hypervisor error: Failed to start instance video: exited with exit code 1 (qemu-system-x86_64: -drive file=rbd:rbd/7cda803e-f877-4001-b9ce-9d460e3f85b8.rbd.disk0,format=raw,if=none,cache=none,id=hotdisk-39dd05de-pci-4,bus=0,unit=4: Unknown protocol 'rbd'
  68
  69
  70 https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=839899
  71
  72 root@r1u28:~# apt-get install qemu-block-extra
  73 root@r1u28:~# gnt-instance start video
  74 Waiting for job 268773 for video ...
  75 root@r1u28:~# gnt-ill video
  76 Instance Status  VCPUs Memory DiskUsage Disk_template Disks Primary_node      Secondary_Nodes
  77 video    running     4   2.0G    110.0G rbd               2 r1u28.gnt.ffzg.hr
  78
  79
   80 Fix the whole cluster and test
  81
  82 root@r1u28:~# gnt-cluster command apt-get install -y qemu-block-extra
  83
  84 # test
  85 root@r1u28:~# gnt-instance reboot video
  86 root@r1u28:~# gnt-instance reboot gray
  87
  88 root@r1u28:~# gnt-ill  | grep ' rbd '
  89 gray                   running        6   6.0G    100.0G rbd               1 r1u28.gnt.ffzg.hr
  90 mudrac                 running        4   4.0G      2.9T rbd               4 r1u32.gnt.ffzg.hr
  91 odin.ffzg.hr           running        2   512M    253.0G rbd               2 r1u28.gnt.ffzg.hr
  92 syslog                 running        2   512M    100.0G rbd               1 r1u32.gnt.ffzg.hr
  93 video                  running        4   2.0G    110.0G rbd               2 r1u28.gnt.ffzg.hr
  94
  95
  96 #XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
  97
   98 # 2018-07-17 problem when creating an rbd instance
  99
 100 root@r1u28:/srv/gnt-info# gnt-instance add -t rbd -s 8g -B vcpus=8,minmem=8G,maxmem=8G --no-ip-check --no-name-check -o debootstrap+default docker1
 101 Tue Jul 17 15:09:43 2018  - INFO: Selected nodes for instance docker1 via iallocator hail: r1u32.gnt.ffzg.hr
 102 Tue Jul 17 15:09:44 2018 * disk 0, size 8.0G
 103 Tue Jul 17 15:09:44 2018 * creating instance disks...
 104 Tue Jul 17 15:09:45 2018  - WARNING: Device creation failed
 105 Failure: command execution error:
 106 Can't create block device <Disk(type=rbd, logical_id=('rbd', '0d32cc93-5c42-400c-a27f-90de928ccb80.rbd.disk0'), children=None, visible as /dev/disk/0, size=8192m)> on node r1u32.gnt.ffzg.hr for instance docker1: Can't create block device: rbd map failed (exited with exit code 6): RBD image feature set mismatch. You can disable features unsupported by the kernel with "rbd feature disable".
 107 In some cases useful info is found in syslog - try "dmesg | tail" or so.
 108 rbd: sysfs write failed
 109 rbd: map failed: (6) No such device or address
 110
 111
 112 more info: http://lists.ceph.com/pipermail/ceph-users-ceph.com/2017-February/016554.html
 113
 114
 115 root@r1u28:/srv/gnt-info# ceph -v
 116 ceph version 10.2.5 (c461ee19ecbc0c5c330aca20f7392c9a00730367)
 117
 118 root@r1u32:~# ceph -v
 119 ceph version 10.2.5 (c461ee19ecbc0c5c330aca20f7392c9a00730367)
 120
 121 root@r1u32:~# dmesg | grep unsup
 122
 123 [232021.291731] rbd: image ed02c9cf-b443-4864-b447-30ef32a64689.rbd.disk0: image uses unsupported features: 0x38
 124 [232592.881965] rbd: image 0d32cc93-5c42-400c-a27f-90de928ccb80.rbd.disk0: image uses unsupported features: 0x38
 125 root@r1u32:~# rbd info ed02c9cf-b443-4864-b447-30ef32a64689.rbd.disk0
 126 rbd image 'ed02c9cf-b443-4864-b447-30ef32a64689.rbd.disk0':
 127         size 8192 MB in 2048 objects
 128         order 22 (4096 kB objects)
 129         block_name_prefix: rbd_data.3d78c92ae8944a
 130         format: 2
 131         features: layering, exclusive-lock, object-map, fast-diff, deep-flatten
 132         flags:
 133 root@r1u32:~# rbd info 0d32cc93-5c42-400c-a27f-90de928ccb80.rbd.disk0
 134 rbd image '0d32cc93-5c42-400c-a27f-90de928ccb80.rbd.disk0':
 135         size 8192 MB in 2048 objects
 136         order 22 (4096 kB objects)
 137         block_name_prefix: rbd_data.3d79d1238e1f29
 138         format: 2
 139         features: layering, exclusive-lock, object-map, fast-diff, deep-flatten
 140         flags:
 141
 142 root@r1u32:~# rbd feature disable ed02c9cf-b443-4864-b447-30ef32a64689.rbd.disk0 exclusive-lock,object-map,fast-diff,deep-flatten
 143 2018-07-17 15:27:14.200376 7f08a554c700 -1 librbd::object_map::RefreshRequest: failed to load object map: rbd_object_map.3d78c92ae8944a
 144 2018-07-17 15:27:14.200628 7f08a554c700 -1 librbd::object_map::InvalidateRequest: 0x7f088c00d920 invalidating object map in-memory
 145 2018-07-17 15:27:14.200657 7f08a4d4b700 -1 librbd::object_map::InvalidateRequest: 0x7f088c00d920 should_complete: r=0
 146 2018-07-17 15:27:14.234047 7f08c33b8100 -1 librbd: failed to update features: (22) Invalid argument
 147 rbd: failed to update image features: (22) Invalid argument
 148
 149
  150 # OK, try to restrict the default rbd features for new images to layering (+ striping, i.e. features = 3) in ceph
 151
 152
 153 root@r1u32:/etc# git diff
 154 diff --git a/ceph/ceph.conf b/ceph/ceph.conf
 155 index d977708..116d243 100644
 156 --- a/ceph/ceph.conf
 157 +++ b/ceph/ceph.conf
 158 @@ -7,6 +7,7 @@
 159    auth_client_required = cephx
 160    setuser match path = /var/lib/ceph/$type/$cluster-$id
 161    rbd default format = 2
 162 +  rbd default features = 3
 163
 164  [osd]
 165    osd journal size = 512
 166
 167 # copy this modification to all nodes
 168
 169 root@r1u28:/srv/gnt-info# gnt-cluster copyfile /etc/ceph/ceph.conf
 170
 171 # test to see that it works
 172
 173 root@r1u28:/srv/gnt-info# gnt-instance add -t rbd -s 8g -B vcpus=8,minmem=8G,maxmem=8G --no-ip-check --no-name-check -o debootstrap+default docker1
 174 Tue Jul 17 15:31:55 2018  - INFO: Selected nodes for instance docker1 via iallocator hail: r1u32.gnt.ffzg.hr
 175 Tue Jul 17 15:31:56 2018 * disk 0, size 8.0G
 176 Tue Jul 17 15:31:56 2018 * creating instance disks...
 177 Tue Jul 17 15:31:56 2018 adding instance docker1 to cluster config
 178 Tue Jul 17 15:31:56 2018 adding disks to cluster config
 179 Tue Jul 17 15:31:57 2018  - INFO: Waiting for instance docker1 to sync disks
 180 Tue Jul 17 15:31:57 2018  - INFO: Instance docker1's disks are in sync
 181 Tue Jul 17 15:31:57 2018  - INFO: Waiting for instance docker1 to sync disks
 182 Tue Jul 17 15:31:57 2018  - INFO: Instance docker1's disks are in sync
 183 Tue Jul 17 15:31:57 2018 * running the instance OS create scripts...
 184 Tue Jul 17 15:32:17 2018 * starting instance...
 185