openshift 4.12 UPI in agent way, single node

OpenShift的安装方式很多了,现在又多了一种,agent based installer。最大的特点是,不需要额外的bootstrap节点了。这可是天大的好消息,因为,以前安装之前,和客户交流,客户总是不理解,为什么红帽说支持3节点部署,但是却要求提供4台服务器。也不能怪客户,按照一般的理解,之前红帽是不支持严格意义上的3节点部署,就因为有这个bootstrap. 现在好了,agent based installer是真正世俗意义上的支持3节点部署了。

从官方文档来看,能压缩掉bootstrap,是因为bootstrap相关的服务,都压缩到一个master节点上,并使用了assisted installer流程,来达到真正的3节点安装的。

  • https://docs.openshift.com/container-platform/4.12/installing/installing_with_agent_based_installer/preparing-to-install-with-agent-based-installer.html

本文,就用agent based installer来装一个单节点的ocp集群。

on helper node


# switch to you install version

export BUILDNUMBER=4.12.9

pushd /data/ocp4/${BUILDNUMBER}
tar -xzf openshift-client-linux-${BUILDNUMBER}.tar.gz -C /usr/local/bin/
tar -xzf openshift-install-linux-${BUILDNUMBER}.tar.gz -C /usr/local/bin/
# tar -xzf oc-mirror.tar.gz -C /usr/local/bin/
# chmod +x /usr/local/bin/oc-mirror
install -m 755 /data/ocp4/clients/butane-amd64 /usr/local/bin/butane
install -m 755 /data/ocp4/clients/coreos-installer_amd64 /usr/local/bin/coreos-installer
popd

# create a user and create the cluster under the user

useradd -m 3node

su - 3node

ssh-keygen

cat << EOF > ~/.ssh/config
StrictHostKeyChecking no
UserKnownHostsFile=/dev/null
EOF

chmod 600 ~/.ssh/config

cat << 'EOF' >> ~/.bashrc

export BASE_DIR='/home/3node/'

EOF

# export BASE_DIR='/home/3node/'

export BUILDNUMBER=4.12.9

mkdir -p ${BASE_DIR}/data/{sno/disconnected,install}

# set some parameter of you rcluster

NODE_SSH_KEY="$(cat ${BASE_DIR}/.ssh/id_rsa.pub)"
INSTALL_IMAGE_REGISTRY=quaylab.infra.wzhlab.top:5443

# PULL_SECRET='{"auths":{"registry.redhat.io": {"auth": "ZHVtbXk6ZHVtbXk=","email": "noemail@localhost"},"registry.ocp4.redhat.ren:5443": {"auth": "ZHVtbXk6ZHVtbXk=","email": "noemail@localhost"},"'${INSTALL_IMAGE_REGISTRY}'": {"auth": "'$( echo -n 'admin:shadowman' | openssl base64 )'","email": "noemail@localhost"}}}'
PULL_SECRET=$(cat /data/pull-secret.json)

NTP_SERVER=192.168.77.11
# HELP_SERVER=192.168.7.11
# KVM_HOST=192.168.7.11
# API_VIP=192.168.77.99
# INGRESS_VIP=192.168.77.98
# CLUSTER_PROVISION_IP=192.168.7.103
# BOOTSTRAP_IP=192.168.7.12
MACHINE_NETWORK='192.168.77.0/24'

# 定义单节点集群的节点信息
SNO_CLUSTER_NAME=osp-demo
SNO_BASE_DOMAIN=wzhlab.top

BOOTSTRAP_IP=192.168.77.42
MASTER_01_IP=192.168.77.43
MASTER_02_IP=192.168.77.44
MASTER_03_IP=192.168.77.45

BOOTSTRAP_IPv6=fd03::42
MASTER_01_IPv6=fd03::43
MASTER_02_IPv6=fd03::44
MASTER_03_IPv6=fd03::45

BOOTSTRAP_HOSTNAME=bootstrap-demo
MASTER_01_HOSTNAME=master-01-demo
MASTER_02_HOSTNAME=master-02-demo
MASTER_03_HOSTNAME=master-03-demo

BOOTSTRAP_INTERFACE=enp1s0
MASTER_01_INTERFACE=enp1s0
MASTER_02_INTERFACE=enp1s0
MASTER_03_INTERFACE=enp1s0

MASTER_01_INTERFACE_MAC=52:54:00:12:A1:01
MASTER_02_INTERFACE_MAC=52:54:00:12:A1:02
MASTER_03_INTERFACE_MAC=52:54:00:12:A1:03

BOOTSTRAP_DISK=/dev/vda
MASTER_01_DISK=/dev/vda
MASTER_02_DISK=/dev/vda
MASTER_03_DISK=/dev/vda

OCP_GW=192.168.77.11
OCP_NETMASK=255.255.255.0
OCP_NETMASK_S=24
OCP_DNS=192.168.77.11

OCP_GW_v6=fd03::11
OCP_NETMASK_v6=64

# echo ${SNO_IF_MAC} > /data/sno/sno.mac

mkdir -p ${BASE_DIR}/data/install
cd ${BASE_DIR}/data/install

/bin/rm -rf *.ign .openshift_install_state.json auth bootstrap manifests master*[0-9] worker*[0-9] *

cat << EOF > ${BASE_DIR}/data/install/install-config.yaml 
apiVersion: v1
baseDomain: $SNO_BASE_DOMAIN
compute:
- name: worker
  replicas: 0 
controlPlane:
  name: master
  replicas: 1
metadata:
  name: $SNO_CLUSTER_NAME
networking:
  # OVNKubernetes , OpenShiftSDN
  clusterNetwork:
    - cidr: 172.21.0.0/16
      hostPrefix: 23
    # - cidr: fd02::/48
    #   hostPrefix: 64
  machineNetwork:
    - cidr: $MACHINE_NETWORK
    # - cidr: 2001:DB8::/32
  serviceNetwork:
    - 172.22.0.0/16
    # - fd03::/112
platform: 
  none: {}
pullSecret: '${PULL_SECRET}'
sshKey: |
$( cat ${BASE_DIR}/.ssh/id_rsa.pub | sed 's/^/   /g' )
additionalTrustBundle: |
$( cat /etc/crts/redhat.ren.ca.crt | sed 's/^/   /g' )
imageContentSources:
- mirrors:
  - ${INSTALL_IMAGE_REGISTRY}/ocp4/openshift4
  source: quay.io/openshift-release-dev/ocp-release
- mirrors:
  - ${INSTALL_IMAGE_REGISTRY}/ocp4/openshift4
  source: quay.io/openshift-release-dev/ocp-v4.0-art-dev
EOF

cat << EOF > ${BASE_DIR}/data/install/agent-config.yaml
apiVersion: v1alpha1
kind: AgentConfig
metadata:
  name: $SNO_CLUSTER_NAME
rendezvousIP: $MASTER_01_IP
additionalNTPSources:
- $NTP_SERVER
hosts:
  - hostname: $MASTER_01_HOSTNAME
    role: master
    rootDeviceHints:
      deviceName: "$MASTER_01_DISK"
    interfaces:
      - name: $MASTER_01_INTERFACE
        macAddress: $MASTER_01_INTERFACE_MAC
    networkConfig:
      interfaces:
        - name: $MASTER_01_INTERFACE
          type: ethernet
          state: up
          mac-address: $MASTER_01_INTERFACE_MAC
          ipv4:
            enabled: true
            address:
              - ip: $MASTER_01_IP
                prefix-length: $OCP_NETMASK_S
            dhcp: false
      dns-resolver:
        config:
          server:
            - $OCP_DNS
      routes:
        config:
          - destination: 0.0.0.0/0
            next-hop-address: $OCP_GW
            next-hop-interface: $MASTER_01_INTERFACE
            table-id: 254        
EOF

/bin/cp -f ${BASE_DIR}/data/install/install-config.yaml ${BASE_DIR}/data/install/install-config.yaml.bak

openshift-install --dir=${BASE_DIR}/data/install agent create cluster-manifests

sudo bash -c "/bin/cp -f mirror/registries.conf /etc/containers/registries.conf.d/; chmod +r /etc/containers/registries.conf.d/*"

# /bin/cp -f  /data/ocp4/ansible-helper/files/* ${BASE_DIR}/data/install/openshift/

sudo bash -c "cd /data/ocp4 ; bash image.registries.conf.sh quaylab.infra.wzhlab.top:5443 ;"

/bin/cp -f /data/ocp4/99-worker-container-registries.yaml ${BASE_DIR}/data/install/openshift
/bin/cp -f /data/ocp4/99-master-container-registries.yaml ${BASE_DIR}/data/install/openshift

cd ${BASE_DIR}/data/install/

# openshift-install --dir=${BASE_DIR}/data/install create ignition-configs 

mkdir -p ~/.cache/agent/image_cache/
/bin/cp -f /data/ocp-$BUILDNUMBER/rhcos-live.x86_64.iso ~/.cache/agent/image_cache/coreos-x86_64.iso

openshift-install --dir=${BASE_DIR}/data/install agent create image --log-level=debug
# ......
# DEBUG Fetching image from OCP release (oc adm release info --image-for=machine-os-images --insecure=true --icsp-file=/tmp/icsp-file3636774741 quay.io/openshift-release-dev/ocp-release@sha256:96bf74ce789ccb22391deea98e0c5050c41b67cc17defbb38089d32226dba0b8)
# DEBUG The file was found in cache: /home/3node/.cache/agent/image_cache/coreos-x86_64.iso
# INFO Verifying cached file
# DEBUG extracting /coreos/coreos-x86_64.iso.sha256 to /tmp/cache1876698393, oc image extract --path /coreos/coreos-x86_64.iso.sha256:/tmp/cache1876698393 --confirm --icsp-file=/tmp/icsp-file455852761 quay.io/openshift-release-dev/ocp-v4.0-art-dev@sha256:052130abddf741195b6753888cf8a00757dedeb7010f7d4dcc4b842b5bc705f6
# ......

coreos-installer iso ignition show agent.x86_64.iso > ignition.ign

# HTTP_PATH=http://192.168.7.11:8080/ignition

source /data/ocp4/acm.fn.sh

# 我们会创建一个wzh用户,密码是redhat,这个可以在第一次启动的是,从console/ssh直接用用户名口令登录
# 方便排错和研究
VAR_PWD_HASH="$(python3 -c 'import crypt,getpass; print(crypt.crypt("redhat"))')"

cat ${BASE_DIR}/data/install/ignition.ign \
  | jq --arg VAR "$VAR_PWD_HASH" --arg VAR_SSH "$NODE_SSH_KEY" '.passwd.users += [{ "name": "wzh", "system": true, "passwordHash": $VAR , "sshAuthorizedKeys": [ $VAR_SSH ], "groups": [ "adm", "wheel", "sudo", "systemd-journal"  ] }]' \
  | jq '. += { "kernel_arguments" : { "should_exist" : [ "systemd.debug-shell=1" ] } }' \
  | jq -c . \
  > ${BASE_DIR}/data/install/ignition-iso.ign

coreos-installer iso ignition embed -f -i ignition-iso.ign agent.x86_64.iso

# VAR_IMAGE_VER=rhcos-410.86.202303200936-AnolisOS-0-live.x86_64.iso


on kvm host ( 103 )

cleanup


create_lv() {
    var_vg=$1
    var_pool=$2
    var_lv=$3
    var_size=$4
    var_action=$5
    lvremove -f $var_vg/$var_lv
    # lvcreate -y -L $var_size -n $var_lv $var_vg
    if [ "$var_action" == "recreate" ]; then
      lvcreate --type thin -n $var_lv -V $var_size --thinpool $var_vg/$var_pool
      wipefs --all --force /dev/$var_vg/$var_lv
    fi
}

virsh destroy ocp4-acm-one-bootstrap
virsh undefine ocp4-acm-one-bootstrap

create_lv vgdata poolA lvacm-one-bootstrap 500G 
create_lv vgdata poolA lvacm-one-bootstrap-data 500G 

virsh destroy ocp4-acm-one-master-01
virsh undefine ocp4-acm-one-master-01

create_lv vgdata poolA lvacm-one-master-01 500G 
create_lv vgdata poolA lvacm-one-master-01-data 500G 

virsh destroy ocp4-acm-one-master-02
virsh undefine ocp4-acm-one-master-02

create_lv vgdata poolA lvacm-one-master-02 500G 
create_lv vgdata poolA lvacm-one-master-02-data 500G 

virsh destroy ocp4-acm-one-master-03
virsh undefine ocp4-acm-one-master-03

create_lv vgdata poolA lvacm-one-master-03 500G 
create_lv vgdata poolA lvacm-one-master-03-data 500G 

begin


cat << EOF >> /etc/sysctl.d/99-wzh-sysctl.conf

vm.overcommit_memory = 1

EOF
sysctl --system

# 创建实验用虚拟网络

mkdir -p /data/kvm
cd /data/kvm

cat << 'EOF' > /data/kvm/bridge.sh
#!/usr/bin/env bash

PUB_CONN='eno1'
PUB_IP='172.21.6.103/24'
PUB_GW='172.21.6.254'
PUB_DNS='172.21.1.1'

nmcli con down "$PUB_CONN"
nmcli con delete "$PUB_CONN"
nmcli con down baremetal
nmcli con delete baremetal
# RHEL 8.1 appends the word "System" in front of the connection,delete in case it exists
nmcli con down "System $PUB_CONN"
nmcli con delete "System $PUB_CONN"
nmcli connection add ifname baremetal type bridge con-name baremetal ipv4.method 'manual' \
    ipv4.address "$PUB_IP" \
    ipv4.gateway "$PUB_GW" \
    ipv4.dns "$PUB_DNS"
    
nmcli con add type bridge-slave ifname "$PUB_CONN" master baremetal
nmcli con down "$PUB_CONN";pkill dhclient;dhclient baremetal
nmcli con up baremetal
EOF
bash /data/kvm/bridge.sh

nmcli con mod baremetal +ipv4.addresses "192.168.7.103/24"
nmcli con up baremetal

cat << EOF > /root/.ssh/config
StrictHostKeyChecking no
UserKnownHostsFile=/dev/null
EOF

pvcreate -y /dev/vdb
vgcreate vgdate /dev/vdb

# https://access.redhat.com/articles/766133
lvcreate -y -n poolA -L 500G vgdata
lvcreate -y -n poolA_meta -L 10G vgdata
lvconvert -y --thinpool vgdata/poolA --poolmetadata vgdata/poolA_meta

lvextend -l +100%FREE vgdata/poolA

mkdir -p /data/kvm/one/

scp root@192.168.77.11:/home/3node/data/install/agent.x86_64.iso /data/kvm/one/

create_lv() {
    var_vg=$1
    var_pool=$2
    var_lv=$3
    var_size=$4
    var_action=$5
    lvremove -f $var_vg/$var_lv
    # lvcreate -y -L $var_size -n $var_lv $var_vg
    if [ "$var_action" == "recreate" ]; then
      lvcreate --type thin -n $var_lv -V $var_size --thinpool $var_vg/$var_pool
      wipefs --all --force /dev/$var_vg/$var_lv
    fi
}


SNO_MEM=64

virsh destroy ocp4-acm-one-master-01
virsh undefine ocp4-acm-one-master-01

create_lv vgdata poolA lvacm-one-master-01 500G recreate
create_lv vgdata poolA lvacm-one-master-01-data 500G recreate

virt-install --name=ocp4-acm-one-master-01 --vcpus=16 --ram=$(($SNO_MEM*1024)) \
  --cpu=host-model \
  --disk path=/dev/vgdata/lvacm-one-master-01,device=disk,bus=virtio,format=raw \
  --disk path=/dev/vgdata/lvacm-one-master-01-data,device=disk,bus=virtio,format=raw \
  --os-variant rhel8.3 --network bridge=baremetal,model=virtio,mac=52:54:00:12:A1:01 \
  --graphics vnc,port=59003 --noautoconsole \
  --boot menu=on --cdrom /data/kvm/one/agent.x86_64.iso


on helper to see result

for unkonwn reason, the vm will be shutdown, instead of reboot, you have to poweron it manually.

cd ${BASE_DIR}/data/install
export KUBECONFIG=${BASE_DIR}/data/install/auth/kubeconfig
echo "export KUBECONFIG=${BASE_DIR}/data/install/auth/kubeconfig" >> ~/.bashrc
# oc completion bash | sudo tee /etc/bash_completion.d/openshift > /dev/null


cd ${BASE_DIR}/data/install
openshift-install --dir=${BASE_DIR}/data/install agent wait-for bootstrap-complete --log-level=debug
# ......
# DEBUG RendezvousIP from the AgentConfig 192.168.77.43
# INFO Bootstrap Kube API Initialized
# INFO Bootstrap configMap status is complete
# INFO cluster bootstrap is complete

cd ${BASE_DIR}/data/install
openshift-install --dir=${BASE_DIR}/data/install agent wait-for install-complete --log-level=debug
# ......
# INFO Install complete!
# INFO To access the cluster as the system:admin user when using 'oc', run
# INFO     export KUBECONFIG=/home/3node/data/install/auth/kubeconfig
# INFO Access the OpenShift web-console here: https://console-openshift-console.apps.osp-demo.wzhlab.top
# INFO Login to the console with user: "kubeadmin", and password: "UmfI2-99uAb-BRdaS-LLjQ9"

password login and oc config


# init setting for helper node
cat << EOF > ~/.ssh/config
StrictHostKeyChecking no
UserKnownHostsFile=/dev/null
EOF
chmod 600 ~/.ssh/config

# ssh core@*****

# sudo -i

# # change password for root
# echo 'redhat' | passwd --stdin root

# sed -i "s|^PasswordAuthentication no$|PasswordAuthentication yes|g" /etc/ssh/sshd_config
# sed -i "s|^PermitRootLogin no$|PermitRootLogin yes|g" /etc/ssh/sshd_config
# sed -i "s|^#ClientAliveInterval 180$|ClientAliveInterval 1800|g" /etc/ssh/sshd_config

# systemctl restart sshd

# # set env, so oc can be used
# cat << EOF >> ~/.bashrc

# export KUBECONFIG=/etc/kubernetes/static-pod-resources/kube-apiserver-certs/secrets/node-kubeconfigs/localhost.kubeconfig

# RET=`oc config use-context system:admin`

# EOF

cat > ${BASE_DIR}/data/install/crack.txt << EOF

echo redhat | sudo passwd --stdin root

sudo sed -i "s|^PasswordAuthentication no$|PasswordAuthentication yes|g" /etc/ssh/sshd_config
sudo sed -i "s|^PermitRootLogin no$|PermitRootLogin yes|g" /etc/ssh/sshd_config
sudo sed -i "s|^#ClientAliveInterval 180$|ClientAliveInterval 1800|g" /etc/ssh/sshd_config

sudo systemctl restart sshd

sudo sh -c 'echo "export KUBECONFIG=/etc/kubernetes/static-pod-resources/kube-apiserver-certs/secrets/node-kubeconfigs/localhost.kubeconfig" >> /root/.bashrc'

sudo sh -c 'echo "RET=\\\`oc config use-context system:admin\\\`" >> /root/.bashrc'

EOF

for i in 23 24 25
do
  ssh core@192.168.7.$i < ${BASE_DIR}/data/install/crack.txt
done

from other host

# https://unix.stackexchange.com/questions/230084/send-the-password-through-stdin-in-ssh-copy-id
dnf install -y sshpass

for i in 23 24 25
do
  sshpass -p 'redhat' ssh-copy-id root@192.168.7.$i
done

poweroff


for i in 23 24 25
do
  ssh root@192.168.7.$i poweroff
done

poweron


virsh start ocp4-acm-one-master-01

virsh start ocp4-acm-one-master-02

virsh start ocp4-acm-one-master-03

back and merge kubeconfig


mkdir -p ~/.kube/bak/

var_date=$(date '+%Y-%m-%d-%H%M')

/bin/cp -f /data/install/auth/kubeconfig ~/.kube/bak/kubeconfig-$var_date
/bin/cp -f /data/install/auth/kubeadmin-password ~/.kube/bak/kubeadmin-password-$var_date

sed "s/admin/admin\/$SNO_CLUSTER_NAME/g" /data/install/auth/kubeconfig > /tmp/config.new

# https://medium.com/@jacobtomlinson/how-to-merge-kubernetes-kubectl-config-files-737b61bd517d
/bin/cp -f ~/.kube/config ~/.kube/config.bak && KUBECONFIG=~/.kube/config:/tmp/config.new kubectl config view --flatten > /tmp/config && /bin/mv -f /tmp/config ~/.kube/config

unset KUBECONFIG

add worker node

我们装好了single node,那么接下来,我们还可以给这个single node添加worker节点,让这个single node cluster变成一个单master的集群。


# first, lets stick ingress to master
oc label node acm-demo-hub-master  ocp-ingress-run="true"

oc patch ingresscontroller default -n openshift-ingress-operator --type=merge --patch='{"spec":{"nodePlacement":{"nodeSelector": {"matchLabels":{"ocp-ingress-run":"true"}}}}}'

# we are testing env, so we don't need ingress replicas.
oc patch --namespace=openshift-ingress-operator --patch='{"spec": {"replicas": 1}}' --type=merge ingresscontroller/default

oc get -n openshift-ingress-operator ingresscontroller/default -o yaml

# then we get worker's ignition file, and start worker node, add it to cluster

oc extract -n openshift-machine-api secret/worker-user-data --keys=userData --to=- > /var/www/html/ignition/sno-worker.ign


HELP_SERVER=192.168.7.11

# 定义单节点集群的节点信息
SNO_IP=192.168.7.16
SNO_GW=192.168.7.11
SNO_NETMAST=255.255.255.0
SNO_HOSTNAME=acm-demo-hub-worker-01
SNO_IF=enp1s0
SNO_DNS=192.168.7.11
SNO_DISK=/dev/vda
SNO_MEM=16

BOOT_ARG=" ip=$SNO_IP::$SNO_GW:$SNO_NETMAST:$SNO_HOSTNAME:$SNO_IF:none nameserver=$SNO_DNS coreos.inst.install_dev=${SNO_DISK##*/} coreos.inst.ignition_url=http://$HELP_SERVER:8080/ignition/sno-worker.ign"

/bin/cp -f /data/ocp4/rhcos-live.x86_64.iso sno.iso

coreos-installer iso kargs modify -a "$BOOT_ARG" sno.iso

# go to kvm host ( 103 )
scp root@192.168.7.11:/data/install/sno.iso /data/kvm/

virsh destroy ocp4-acm-hub-worker01
virsh undefine ocp4-acm-hub-worker01

create_lv() {
    var_vg=$1
    var_pool=$2
    var_lv=$3
    var_size=$4
    var_action=$5
    lvremove -f $var_vg/$var_lv
    # lvcreate -y -L $var_size -n $var_lv $var_vg
    if [ "$var_action" == "recreate" ]; then
      lvcreate --type thin -n $var_lv -V $var_size --thinpool $var_vg/$var_pool
      wipefs --all --force /dev/$var_vg/$var_lv
    fi
}

create_lv vgdata poolA lvacmhub-worker01 500G recreate
# create_lv vgdata poolA lvacmhub-worker01-data 500G remove

virt-install --name=ocp4-acm-hub-worker01 --vcpus=16 --ram=$(($SNO_MEM*1024)) \
  --cpu=host-model \
  --disk path=/dev/vgdata/lvacmhub-worker01,device=disk,bus=virtio,format=raw \
  `# --disk path=/dev/vgdata/lvacmhub-data,device=disk,bus=virtio,format=raw` \
  --os-variant rhel8.3 --network bridge=baremetal,model=virtio \
  --graphics vnc,port=59003 \
  --boot menu=on --cdrom /data/kvm/sno.iso 

# after 2 boot up,
# go back to helper
oc get csr
oc get csr -ojson | jq -r '.items[] | select(.status == {} ) | .metadata.name' | xargs oc adm certificate approve

end