dongxujian 发表于 2016-8-26 22:46:17

openfiler 2.9 ha cluster 集群配置测试



1. 创建系统要求:
hostname: openfiler01
eth0: 192.168.1.155
eth1:10.10.5.155
500MB Meta partition
4GB+ Data partition

hostname: openfiler02
eth0:192.168.1.156
eth1:10.10.5.156
500MB Meta partition
4GB+ Data partition
virtualip:192.168.1.157 ( don't use on any adapter, we will make this later with corosync )

1.1 添加到 /etc/hosts 文件里面去(openfiler01 openfiler02都执行)
192.168.1.155        openfiler01
192.168.1.156        openfiler02

1.2 建立ssh认证
root@openfiler01 ~# ssh-keygen -t dsa
Generating public/private dsa key pair.
Enter file in which to save the key (/root/.ssh/id_dsa):
Created directory '/root/.ssh'.
Enter passphrase (empty for no passphrase):
Enter same passphrase again:
Your identification has been saved in /root/.ssh/id_dsa.
Your public key has been saved in /root/.ssh/id_dsa.pub.
The key fingerprint is:
Do the same on openfiler02.
root@openfiler02 ~# ssh-keygen -t dsa
Then exchange the files:
root@openfiler01 ~# scp ~/.ssh/id_dsa.pub root@openfiler02:~/.ssh/authorized_keys
root@openfiler02 ~# scp ~/.ssh/id_dsa.pub root@openfiler01:~/.ssh/authorized_keys

2. Create meta/data Partition on both filers


sdb1 83
sdb2 8e

# fdisk -l

Disk /dev/sda: 17.2 GB, 17179869184 bytes
255 heads, 63 sectors/track, 2088 cylinders, total 33554432 sectors
Units = sectors of 1 * 512 = 512 bytes
Sector size (logical/physical): 512 bytes / 512 bytes
I/O size (minimum/optimal): 512 bytes / 512 bytes
Disk identifier: 0x0000425b

   Device Boot      Start         End      Blocks   Id  System
/dev/sda1   *          63      610469      305203+  83  Linux
/dev/sda2          610470    17382329     8385930   83  Linux
/dev/sda3        17382330    19486844     1052257+  82  Linux swap / Solaris

Disk /dev/sdb: 21.5 GB, 21474836480 bytes
255 heads, 63 sectors/track, 2610 cylinders, total 41943040 sectors
Units = sectors of 1 * 512 = 512 bytes
Sector size (logical/physical): 512 bytes / 512 bytes
I/O size (minimum/optimal): 512 bytes / 512 bytes
Disk identifier: 0x000ab4f7

   Device Boot      Start         End      Blocks   Id  System
# fdisk /dev/sdb

Command (m for help): n
Command action
   e   extended
   p   primary partition (1-4)
p
Partition number (1-4, default 1):
Using default value 1
First sector (2048-41943039, default 2048):
Using default value 2048
Last sector, +sectors or +size{K,M,G} (2048-41943039, default 41943039): +500M

Command (m for help): t
Selected partition 1
Hex code (type L to list codes): 83

Command (m for help): n
Command action
   e   extended
   p   primary partition (1-4)
p
Partition number (1-4, default 2):
Using default value 2
First sector (1026048-41943039, default 1026048):
Using default value 1026048
Last sector, +sectors or +size{K,M,G} (1026048-41943039, default 41943039):
Using default value 41943039

Command (m for help): t
Partition number (1-4): 2
Hex code (type L to list codes): 8e
Changed system type of partition 2 to 8e (Linux LVM)

Command (m for help): w
The partition table has been altered!

Calling ioctl() to re-read partition table.
Syncing disks.
# partprobe
# fdisk -l

Disk /dev/sda: 17.2 GB, 17179869184 bytes
255 heads, 63 sectors/track, 2088 cylinders, total 33554432 sectors
Units = sectors of 1 * 512 = 512 bytes
Sector size (logical/physical): 512 bytes / 512 bytes
I/O size (minimum/optimal): 512 bytes / 512 bytes
Disk identifier: 0x0000425b

   Device Boot      Start         End      Blocks   Id  System
/dev/sda1   *          63      610469      305203+  83  Linux
/dev/sda2          610470    17382329     8385930   83  Linux
/dev/sda3        17382330    19486844     1052257+  82  Linux swap / Solaris

Disk /dev/sdb: 21.5 GB, 21474836480 bytes
255 heads, 63 sectors/track, 2610 cylinders, total 41943040 sectors
Units = sectors of 1 * 512 = 512 bytes
Sector size (logical/physical): 512 bytes / 512 bytes
I/O size (minimum/optimal): 512 bytes / 512 bytes
Disk identifier: 0x000ab4f7

   Device Boot      Start         End      Blocks   Id  System
/dev/sdb1            2048     1026047      512000   83  Linux
/dev/sdb2         1026048    41943039    20458496   8e  Linux LVM

2.1 Create DRBD Setup
Edit /etc/drbd.conf on openfiler01 and openfiler02:(配置drbd的主配置文件)
# cat /etc/drbd.conf
# You can find an example in /usr/share/doc/drbd.../drbd.conf.example
# Pull in the shared global/common settings and any per-resource files
# before the two resources defined inline below.
include "drbd.d/global_common.conf";
include "drbd.d/*.res";
# Resource "meta": mirrors /dev/sdb1 (the 500MB partition created above)
# between the two filers as /dev/drbd0, replicating over the 10.10.5.x
# (eth1) addresses on TCP port 7788.
resource meta {
on openfiler01 {
device /dev/drbd0;
disk /dev/sdb1;
address 10.10.5.155:7788;
# "internal" keeps the DRBD metadata on the backing disk itself.
meta-disk internal;
}
on openfiler02 {
device /dev/drbd0;
disk /dev/sdb1;
address 10.10.5.156:7788;
meta-disk internal;
}
}
# Resource "data": mirrors /dev/sdb2 (the LVM-typed partition) as /dev/drbd1
# over the same eth1 link, using a separate port (7789).
resource data {
on openfiler01 {
device /dev/drbd1;
disk /dev/sdb2;
address 10.10.5.155:7789;
meta-disk internal;
}
on openfiler02 {
device /dev/drbd1;
disk /dev/sdb2;
address 10.10.5.156:7789;
meta-disk internal;
}
}
然后用drbdadm创建meta和data分区,如果第一次创建有错误提示的话,可以先通过下面的方法操作,然后再创建(注意,这两个分区不能在/etc/fstab分区表中出现,它们是由drbd控制)
dd if=/dev/zero of=/dev/drbdX
root@openfiler01 ~# drbdadm create-md meta
root@openfiler01 ~# drbdadm create-md data
root@openfiler02 ~# drbdadm create-md meta
root@openfiler02 ~# drbdadm create-md data
Now you can start up drbd with:
service drbd start
on both nodes.
Make one node primary:(将其中一个节点设置为主节点)
root@openfiler01 ~# drbdsetup /dev/drbd0 primary -o
root@openfiler01 ~# drbdsetup /dev/drbd1 primary -o

2.2 创建文件系统
root@openfiler01 ~# mkfs.ext3 /dev/drbd0


2.2.1 Openfiler to meta-Partition
在openfiler01上面执行以下脚本
root@openfiler01 ~# service openfiler stop
==============开始=====================
#!/bin/bash
# Run once on openfiler01 (with the openfiler service stopped) to relocate the
# Openfiler installation and the service configuration onto the DRBD-backed
# /meta filesystem, leaving symlinks at the original locations.
# The statements are order-dependent: every mv/cp must happen after /meta is
# mounted and before the matching ln -s.

# Mount the replicated meta device.
mkdir /meta
mount /dev/drbd0 /meta

# Keep the original tree as /opt/openfiler.local, copy it onto /meta and
# make /opt/openfiler point at the replicated copy.
mv /opt/openfiler/ /opt/openfiler.local
mkdir /meta/opt
cp -a /opt/openfiler.local /meta/opt/openfiler
ln -s /meta/opt/openfiler /opt/openfiler

# Replace the copied sbin/openfiler entry with a link to the system httpd.
rm /meta/opt/openfiler/sbin/openfiler
ln -s /usr/sbin/httpd /meta/opt/openfiler/sbin/openfiler

# rsync.xml stays node-local: the replicated tree links back to the local copy.
rm /meta/opt/openfiler/etc/rsync.xml
ln -s /opt/openfiler.local/etc/rsync.xml /meta/opt/openfiler/etc/
mkdir -p /meta/etc/httpd/conf.d

# Stop nfslock and unmount the rpc_pipefs pseudo filesystem so that
# /var/lib/nfs can be moved below. The kernel filesystem type is spelled
# "rpc_pipefs" (underscore); with the hyphenated spelling "umount -a -t"
# matches nothing and the mount point stays busy.
service nfslock stop
umount -a -t rpc_pipefs

# Move each service's configuration/state onto /meta and symlink it back.
mv /etc/samba/ /meta/etc/
ln -s /meta/etc/samba/ /etc/samba
mkdir -p /meta/var/spool
mv /var/spool/samba/ /meta/var/spool/
ln -s /meta/var/spool/samba/ /var/spool/samba
mkdir -p /meta/var/lib
mv /var/lib/nfs/ /meta/var/lib/
ln -s /meta/var/lib/nfs/ /var/lib/nfs
mv /etc/exports /meta/etc/
ln -s /meta/etc/exports /etc/exports
mv /etc/ietd.conf /meta/etc/
ln -s /meta/etc/ietd.conf /etc/ietd.conf
mv /etc/initiators.allow /meta/etc/
ln -s /meta/etc/initiators.allow /etc/initiators.allow
mv /etc/initiators.deny /meta/etc/
ln -s /meta/etc/initiators.deny /etc/initiators.deny
mv /etc/proftpd /meta/etc/
ln -s /meta/etc/proftpd/ /etc/proftpd

# Re-point the httpd modules link at the 64-bit system module directory.
rm /opt/openfiler/etc/httpd/modules
ln -s /usr/lib64/httpd/modules /opt/openfiler/etc/httpd/modules

service openfiler start
==============结束=====================
2.2.4 openfiler02 Openfiler Configuration
在openfiler02上面执行以下脚本
# cat start.sh
#!/bin/bash
# Run once on openfiler02 (the standby node). Nothing is copied here: the
# local copies of the service configuration are removed and replaced by
# symlinks into /meta, which only resolve once the DRBD meta device is
# mounted on this node.

service openfiler stop

# Create the mount point and redirect /opt/openfiler to the replicated tree,
# keeping the original installation as /opt/openfiler.local.
mkdir /meta
mv /opt/openfiler/ /opt/openfiler.local
ln -s /meta/opt/openfiler /opt/openfiler

# Stop nfslock and unmount the rpc_pipefs pseudo filesystem so that
# /var/lib/nfs can be removed below. The kernel filesystem type is spelled
# "rpc_pipefs" (underscore); with the hyphenated spelling "umount -a -t"
# matches nothing and the mount point stays busy.
service nfslock stop
umount -a -t rpc_pipefs

# Replace each local config/state path with a symlink into /meta.
rm -rf /etc/samba/
ln -s /meta/etc/samba/ /etc/samba
rm -rf /var/spool/samba/
ln -s /meta/var/spool/samba/ /var/spool/samba
rm -rf /var/lib/nfs/
ln -s /meta/var/lib/nfs/ /var/lib/nfs
rm -rf /etc/exports
ln -s /meta/etc/exports /etc/exports
rm /etc/ietd.conf
ln -s /meta/etc/ietd.conf /etc/ietd.conf
rm /etc/initiators.allow
ln -s /meta/etc/initiators.allow /etc/initiators.allow
rm /etc/initiators.deny
ln -s /meta/etc/initiators.deny /etc/initiators.deny
rm -rf /etc/proftpd
ln -s /meta/etc/proftpd/ /etc/proftpd


2.3 创建lvm分区
Change the lvm filter in the
/etc/lvm/lvm.conf
file from:
filter = [ "a/.*/" ]
to更改为
filter = [ "a|drbd|", "r|.*|" ]
然后传输到openfiler02上面
root@openfiler01 ~# scp /etc/lvm/lvm.conf root@openfiler02:/etc/lvm/lvm.conf
After that we can create the actual used stuff:(然后创建lvm分区)
root@openfiler01 ~# pvcreate /dev/drbd1
root@openfiler01 ~# vgcreate data /dev/drbd1
root@openfiler01 ~# lvcreate -L 400M -n filer data



3. 开始搭建corosync
3.1 Create Corosync authkey创建双方的认证
root@openfiler01~# corosync-keygen   执行之后等待他的输出,一直到结束!
( Press the real keyboard instead of pressing keys in an ssh terminal. )
Copy the authkey file to the other node and change the fileaccess:
root@openfiler01~# scp /etc/corosync/authkey root@openfiler02:/etc/corosync/authkey
root@openfiler02~# chmod 400 /etc/corosync/authkey

3.2 创建 pcmk /etc/corosync/service.d/pcmk
root@openfiler01~# vi /etc/corosync/service.d/pcmk
service {
      # Load the Pacemaker Cluster Resource Manager
      name: pacemaker
      ver:0
}

3.2.1 拷贝到openfiler02上面
root@openfiler01~# scp /etc/corosync/service.d/pcmk root@openfiler02:/etc/corosync/service.d/pcmk

3.3 Create the corosync.conf file and change it to present your lan net ( bindnetaddr )
# cat /etc/corosync/corosync.conf
# Please read the corosync.conf.5 manual page
compatibility: whitetank
totem {
      version: 2
      # NOTE(review): with secauth off the authkey generated earlier is
      # presumably not enforced - confirm against corosync.conf(5).
      secauth: off
      threads: 0
      interface {
                ringnumber: 0
                # Network address of the heartbeat link (the eth1 10.10.5.x
                # subnet). Keep explanatory notes on their own comment line:
                # text appended after the value becomes part of the value.
                bindnetaddr: 10.10.5.0
                # Multicast group used by the cluster; choose an address
                # from this range.
                mcastaddr: 226.94.8.8
                mcastport: 5405
                ttl: 1
      }
}
logging {
      fileline: off
      to_stderr: no
      to_logfile: yes
      to_syslog: yes
      logfile: /var/log/cluster/corosync.log
      debug: off
      timestamp: on
      logger_subsys {
                subsys: AMF
                debug: off
      }
}
amf {
      mode: disabled
}

3.3.1 拷贝一份到openfiler02
root@openfiler01~# scp /etc/corosync/corosync.conf root@openfiler02:/etc/corosync/corosync.conf

4.准备corosync配置
在重启机器之前,先把以下服务的开机自启动关掉(并把corosync设为开机自启动),因为我们需要用corosync来控制它们

root@openfiler01~# chkconfig --level 2345 openfiler off
root@openfiler01~# chkconfig --level 2345 nfslock off
root@openfiler01~# chkconfig --level 2345 corosync on
俩个节点都要执行:
root@openfiler02~# chkconfig --level 2345 openfiler off
root@openfiler02~# chkconfig --level 2345 nfslock off
root@openfiler02~# chkconfig --level 2345 corosync on
然后重启机器,等待....

4.1 Check if corosync started properly查看corosync是否启动正常
root@openfiler01~# ps auxf
root@openfiler01~# ps auxf
root      3480  0.0  0.8 534456  4112 ?        Ssl  19:15   0:00 corosync
root      3486  0.0  0.5  68172  2776 ?        S    19:15   0:00  \_ /usr/lib64/heartbeat/stonith
106       3487  0.0  1.0  67684  4956 ?        S    19:15   0:00  \_ /usr/lib64/heartbeat/cib
root      3488  0.0  0.4  70828  2196 ?        S    19:15   0:00  \_ /usr/lib64/heartbeat/lrmd
106       3489  0.0  0.6  68536  3096 ?        S    19:15   0:00  \_ /usr/lib64/heartbeat/attrd
106       3490  0.0  0.6  69064  3420 ?        S    19:15   0:00  \_ /usr/lib64/heartbeat/pengine
106       3491  0.0  0.7  76764  3488 ?        S    19:15   0:00  \_ /usr/lib64/heartbeat/crmd
root@openfiler02~# crm_mon --one-shot -V
crm_mon: 2011/03/24_19:32:07 ERROR: unpack_resources: Resource start-up disabled since no STONITH resources have been defined
crm_mon: 2011/03/24_19:32:07 ERROR: unpack_resources: Either configure some or disable STONITH with the stonith-enabled option
crm_mon: 2011/03/24_19:32:07 ERROR: unpack_resources: NOTE: Clusters with shared data need STONITH to ensure data integrity
============
Last updated: Thu Mar 24 19:32:07 2011
Stack: openais
Current DC: openfiler01 - partition with quorum
Version: 1.1.2-c6b59218ee949eebff30e837ff6f3824ed0ab86b
2 Nodes configured, 2 expected votes
0 Resources configured.
============

Online: [ openfiler01 openfiler02 ]

4.2 Configure Corosync as following配置corosync
Before configuring, start monitoring the cluster status on openfiler02 so you can watch the resources come up:
root@openfiler02~# crm_mon

4.2.1 Howto configure corosync step by step
root@openfiler01~# crm configure
crm(live)configure# property stonith-enabled="false"
crm(live)configure# property no-quorum-policy="ignore"
crm(live)configure# rsc_defaults $id="rsc-options" resource-stickiness="100"
crm(live)configure# primitive ClusterIP ocf:heartbeat:IPaddr2 params ip="192.168.1.157" cidr_netmask="24" op monitor interval="30s"
crm(live)configure# primitive MetaFS ocf:heartbeat:Filesystem params device="/dev/drbd0" directory="/meta" fstype="ext3"
#crm(live)configure# primitive lvmdata ocf:heartbeat:LVM params volgrpname="data"
crm(live)configure# primitive drbd_meta ocf:linbit:drbd params drbd_resource="meta" op monitor interval="15s"
crm(live)configure# primitive drbd_data ocf:linbit:drbd params drbd_resource="data" op monitor interval="15s"
crm(live)configure# primitive openfiler lsb:openfiler
crm(live)configure# primitive iscsi lsb:iscsi-target
#crm(live)configure# primitive samba lsb:smb
#crm(live)configure# primitive nfs lsb:nfs
#crm(live)configure# primitive nfslock lsb:nfslock
crm(live)configure# group g_drbd drbd_meta drbd_data
crm(live)configure# group g_services MetaFS lvmdata openfiler ClusterIP iscsi samba nfs nfslock
crm(live)configure# ms ms_g_drbd g_drbd meta master-max="1" master-node-max="1" clone-max="2" clone-node-max="1" notify="true"
crm(live)configure# colocation c_g_services_on_g_drbd inf: g_services ms_g_drbd:Master
crm(live)configure# order o_g_servicesafter_g_drbd inf: ms_g_drbd:promote g_services:start
crm(live)configure# commit

Watch now on the monitor process how the resources all start hopefully.
root@openfiler01 ~# crm_mon
以上会有warn提示,但是不是报错,没有关系

4.2.2 Troubleshooting
If you get any errors because you done commit before the end of the config, then you need to do a cleanup, as in this example:
root@openfiler01~# crm
crm(live)# resource cleanup MetaFS

4.2.3 Verify the config
验证你的配置信息,通过输入
# crm configure show
node openfiler01
node openfiler02
primitive ClusterIP ocf:heartbeat:IPaddr2 \
        params ip="192.168.1.157" cidr_netmask="24" \
        op monitor interval="30s"
primitive MetaFS ocf:heartbeat:Filesystem \
        params device="/dev/drbd0" directory="/meta" fstype="ext3"
primitive drbd_data ocf:linbit:drbd \
        params drbd_resource="data" \
        op monitor interval="15s"
primitive drbd_meta ocf:linbit:drbd \
        params drbd_resource="meta" \
        op monitor interval="15s"
primitive iscsi lsb:iscsi-target
primitive lvmdata ocf:heartbeat:LVM \
        params volgrpname="data"
primitive nfs lsb:nfs
primitive nfslock lsb:nfslock
primitive openfiler lsb:openfiler
primitive samba lsb:smb
group g_drbd drbd_meta drbd_data
group g_services MetaFS lvmdata openfiler ClusterIP iscsi samba nfs nfslock
ms ms_g_drbd g_drbd \
        meta master-max="1" master-node-max="1" clone-max="2" clone-node-max="1" notify="true"
colocation c_g_services_on_g_drbd inf: g_services ms_g_drbd:Master
order o_g_servicesafter_g_drbd inf: ms_g_drbd:promote g_services:start
property $id="cib-bootstrap-options" \
        dc-version="1.1.2-c6b59218ee949eebff30e837ff6f3824ed0ab86b" \
        cluster-infrastructure="openais" \
        expected-quorum-votes="2" \
        stonith-enabled="false" \
        no-quorum-policy="ignore"
rsc_defaults $id="rsc-options" \
        resource-stickiness="100"

然后你可以通过输入
crm_mon来查看状态。服务是不是正常启动

Last updated: Mon Dec 17 10:40:54 2012
Stack: openais
Current DC: openfiler01 - partition with quorum
Version: 1.1.2-c6b59218ee949eebff30e837ff6f3824ed0ab86b
4 Nodes configured, 2 expected votes
2 Resources configured.
============

Online: [ openfiler01 openfiler02 ]

Resource Group: g_services
   MetaFS   (ocf::heartbeat:Filesystem):    Started openfiler01
   lvmdata    (ocf::heartbeat:LVM):            Started openfiler01
   openfiler(lsb:openfiler):                        Started openfiler01
   ClusterIP(ocf::heartbeat:IPaddr2):         Started openfiler01
   iscsi      (lsb:iscsi-target):                           Started openfiler01
   samba      (lsb:smb):                                 Started openfiler01
   nfs      (lsb:nfs):                                          Started openfiler01
   nfslock    (lsb:nfslock):                              Started openfiler01
   Master/Slave Set: ms_g_drbd
   Masters: [ openfiler01 ]
   Slaves: [ openfiler02 ]
   
   
   
   
5.模拟心跳down机实验以及恢复
5.1 在目前主机上面建立crontab任务
问题:drbd脑裂问题,当心跳线断掉的时候,两个节点同时可以对外提供服务,通过corosync管理服务机制出现问题,双方都对外提供服务,会导致最终数据产生不一致。
按照上面实验openfiler01作为主机对外提供服务,在openfiler01上面添加crontab任务,当检测到自己网卡down掉,或者是心跳线出现问题,
在验证属于自己问题后,将集群服务停掉,自动退出集群节点。

# cat drbd.sh
#!/bin/bash
# drbd.sh - split-brain guard, run every minute from cron on the active node.
# If the heartbeat interface (eth1) is missing, or neither the peer nor the
# reference host answers a ping, corosync is killed so this node leaves the
# cluster instead of continuing to serve alongside the peer.
# Failures are appended to /root/fail.txt, peer-only failures to
# /root/alter.txt.
#env > /root/bbb

# Check 1: eth1 must be listed by ifconfig (i.e. the interface is up).
# grep's output is deliberately left on stdout, as in the original script.
if /sbin/ifconfig | grep eth1; then
  echo " $(date +%F/%H:%M) : eth1 OK"
else
  echo "$(date +%F/%H:%M): eth1 down ,stop service" >> /root/fail.txt
  pkill corosync
fi

# Check 2: the peer must answer on the heartbeat network. The portable
# redirection form is used because cron starts this script with "sh".
if ping -c1 10.10.5.156 > /dev/null 2>&1; then
  echo " $(date +%F/%H:%M) : ping OK"
elif ping -c1 192.168.11.1 > /dev/null 2>&1; then
  # NOTE(review): 192.168.11.1 is presumably a gateway/reference host used to
  # tell "peer is down" from "our own network is down" - confirm the address,
  # the rest of this setup uses 192.168.1.x.
  echo "$(date +%F/%H:%M) : ping 11.1OK 223 failed" >> /root/alter.txt
else
  # Neither host is reachable: assume the fault is local and leave the cluster.
  echo "$(date +%F/%H:%M): ping failed , stop service " >> /root/fail.txt
  pkill corosync
fi
添加到crontab,时时检测问题。
# crontab -l
* * * * * sh /root/drbd.sh &

5.2 进行openfiler01 down机实验

在openfiler01上面down掉网卡,通过检测到网卡down之后。
#ifdown eth1
openfiler01会自动关闭corosync服务,脱离集群,这个时候通过
https://192.168.1.157:446
查看openfiler是正常对外提供服务的,一直ping 虚拟IP(192.168.1.157)也查看到没有超时的状况,符合我们本来的需求。
当openfiler01恢复的时候,需要重新添加到集群里面。
以下操作需要在openfiler01上面执行:
首先设置openfiler01作为备机出现
# drbdadm secondary all(在drbd.conf中设置的名称,可以用all)
# drbdadm disconnect all   (备机断掉可以用all)
# drbdadm -- --discard-my-data connect all(同步数据连接上)
现在查看drbd的状态会发现是 diskless/uptodate 解决diskless需要运行下面命令
# drbdadm attach all(解决diskless问题)
# drbdadm invalidate all(从头重新同步数据,这个持续时间会很长)
通过以上恢复步骤,openfiler01会重新添加进来,然后记住手动启动起来corosync服务
然后主备的身份完成切换了,如果需要更换回到openfiler01,需要手动将openfiler02的网卡down掉按照上述过程重新切换回去即可。


问题:

# crm status
============
Last updated: Fri Jun 10 08:13:37 2016
Stack: openais
Current DC: openfiler01 - partition with quorum
Version: 1.1.2-c6b59218ee949eebff30e837ff6f3824ed0ab86b
2 Nodes configured, 2 expected votes
3 Resources configured.
============

Online: [ openfiler01 openfiler02 ]

Resource Group: g_services
   MetaFS        (ocf::heartbeat:Filesystem):        Started openfiler02
   lvmdata        (ocf::heartbeat:LVM):        Started openfiler02
   openfiler        (lsb:openfiler):        Started openfiler02
   ClusterIP        (ocf::heartbeat:IPaddr2):        Started openfiler02
   iscsi        (lsb:iscsi-target):        Started openfiler02
   samba        (lsb:smb):        Started openfiler02
   nfs        (lsb:nfs):        Started openfiler02
Master/Slave Set: ms_g_drbd
   Masters: [ openfiler02 ]
   Slaves: [ openfiler01 ]

Failed actions:
    nfslock_start_0 (node=openfiler02, call=16, rc=1, status=complete): unknown error
    nfslock_start_0 (node=openfiler01, call=12, rc=1, status=complete): unknown error


调整:

# crm configure edit

node openfiler01
node openfiler02
primitive ClusterIP ocf:heartbeat:IPaddr2 \
      params ip="192.168.1.157" cidr_netmask="24" \
      op monitor interval="30s"
primitive MetaFS ocf:heartbeat:Filesystem \
      params device="/dev/drbd0" directory="/meta" fstype="ext3"
primitive drbd_data ocf:linbit:drbd \
      params drbd_resource="data" \
      op monitor interval="15s"
primitive drbd_meta ocf:linbit:drbd \
      params drbd_resource="meta" \
      op monitor interval="15s"
primitive iscsi lsb:iscsi-target
primitive lvmdata ocf:heartbeat:LVM \
      params volgrpname="data"
primitive nfs lsb:nfs
primitive nfslock lsb:nfslock
primitive openfiler lsb:openfiler
primitive samba lsb:smb
group g_drbd drbd_meta drbd_data
group g_services MetaFS lvmdata openfiler ClusterIP iscsi samba nfs nfslock

nfs 后面添加nfslock

:wq
commit
页: [1]
查看完整版本: openfiler 2.9 ha cluster 集群配置测试