From 47c289d7a81ac4d1cde6275ed90cd130bd555553 Mon Sep 17 00:00:00 2001
From: siddharthvipul
Date: Jun 23 2021 17:05:41 +0000
Subject: add OCP4 docs


Signed-off-by: siddharthvipul

---

diff --git a/docs/operations/ci/adding-jumphost.md b/docs/operations/ci/adding-jumphost.md
new file mode 100644
index 0000000..8222b0c
--- /dev/null
+++ b/docs/operations/ci/adding-jumphost.md
@@ -0,0 +1,17 @@
+# Add user to jumphost
+
+Jumphost users live in the inventory file `inventory/ci-ssh-jumphosts`
+
+* Make an entry for the user in the inventory in the following format
+```
+login_name: billgates
+full_name: "Bill Gates | Microsoft loves linux"
+ssh_pub_keys:
+  - "- - - - "
+```
+* Ensure all the latest commits in the playbook directory and inventory are pulled locally.
+`git pull inventory/ && git pull playbooks`
+* Use the baseline playbook to invoke the role-baseline role. We will limit
+  the run to just the ci-ssh-jumphosts group.
+`ansible-playbook playbooks/role-baseline.yml --limit ci-ssh-jumphosts`
+* Update the remote with the latest changes
diff --git a/docs/operations/ci/adding_admin_users.md b/docs/operations/ci/adding_admin_users.md
new file mode 100644
index 0000000..6a596e3
--- /dev/null
+++ b/docs/operations/ci/adding_admin_users.md
@@ -0,0 +1,49 @@
+# Adding users to the cluster admin group
+To add cluster admin privileges to a particular user, do the following.
+
+When a user authenticates to the Openshift cluster via ACO, a User object is automatically created within Openshift, eg:
+
+```
+kind: User
+apiVersion: user.openshift.io/v1
+metadata:
+  name: email@address.com
+...
+```
+
+Create a Group ocp-ci-admins and add the following users. Each "user" corresponds with the `metadata.name` of its User object.
+
+```
+kind: Group
+apiVersion: user.openshift.io/v1
+metadata:
+  name: ocp-ci-admins
+  selfLink: /apis/user.openshift.io/v1/groups/ocp-ci-admins
+  uid: 24a5ad4d-7ee0-4e30-8f92-4b398ba5d389
+  resourceVersion: '6800501'
+  creationTimestamp: '2020-05-27T16:03:26Z'
+users:
+  - email@address.com
+```
+
+Add a ClusterRoleBinding to bind our Group ocp-ci-admins to the ClusterRole cluster-admin:
+
+```
+kind: ClusterRoleBinding
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: ocp-ci-cluster-admins
+  selfLink: /apis/rbac.authorization.k8s.io/v1/clusterrolebindings/ocp-ci-cluster-admins
+  uid: 7979a53b-6597-4ec7-9d6c-53b5ab8004c7
+  resourceVersion: '6799178'
+  creationTimestamp: '2020-05-27T16:03:58Z'
+subjects:
+  - kind: Group
+    apiGroup: rbac.authorization.k8s.io
+    name: ocp-ci-admins
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: cluster-admin
+```
+
diff --git a/docs/operations/ci/adding_cico_tenant/adding-duffy-api-key.md b/docs/operations/ci/adding_cico_tenant/adding-duffy-api-key.md
new file mode 100644
index 0000000..7bf716e
--- /dev/null
+++ b/docs/operations/ci/adding_cico_tenant/adding-duffy-api-key.md
@@ -0,0 +1,106 @@
+# SOP to create duffy API/SSH keys
+This SOP covers the process of creating an API key for duffy, and adding it to the duffy database table.
+
+
+## Requirements
+
+- project name
+
+## Duffy Database Schemas
+
+```
+MariaDB [duffy]> show tables;
++-----------------+
+| Tables_in_duffy |
++-----------------+
+| alembic_version |
+| session_archive |
+| session_hosts   |
+| sessions        |
+| stock           |
+| userkeys        |
+| users           |
++-----------------+
+7 rows in set (0.00 sec)
+
+MariaDB [duffy]> describe stock;
++--------------+--------------+------+-----+---------+-------+
+| Field        | Type         | Null | Key | Default | Extra |
++--------------+--------------+------+-----+---------+-------+ +| id | int(11) | NO | PRI | NULL | | +| hostname | varchar(20) | YES | | NULL | | +| ip | varchar(15) | YES | | NULL | | +| chassis | varchar(20) | YES | | NULL | | +| used_count | int(11) | YES | | NULL | | +| state | varchar(20) | YES | | NULL | | +| comment | varchar(255) | YES | | NULL | | +| distro | varchar(20) | YES | | NULL | | +| rel | varchar(10) | YES | | NULL | | +| ver | varchar(10) | YES | | NULL | | +| arch | varchar(10) | YES | | NULL | | +| pool | int(11) | YES | | NULL | | +| console_port | int(11) | YES | | NULL | | +| flavor | varchar(20) | YES | | NULL | | +| session_id | varchar(37) | YES | MUL | NULL | | +| next_state | varchar(20) | YES | | NULL | | ++--------------+--------------+------+-----+---------+-------+ +16 rows in set (0.01 sec) + +MariaDB [duffy]> describe users; ++-------------+-------------+------+-----+---------+-------+ +| Field | Type | Null | Key | Default | Extra | ++-------------+-------------+------+-----+---------+-------+ +| apikey | varchar(37) | NO | PRI | | | +| projectname | varchar(50) | YES | | NULL | | +| jobname | varchar(50) | YES | | NULL | | +| createdat | date | YES | | NULL | | +| limitnodes | int(11) | YES | | NULL | | ++-------------+-------------+------+-----+---------+-------+ +5 rows in set (0.00 sec) + +MariaDB [duffy]> describe userkeys; ++------------+---------------+------+-----+---------+----------------+ +| Field | Type | Null | Key | Default | Extra | ++------------+---------------+------+-----+---------+----------------+ +| id | int(11) | NO | PRI | NULL | auto_increment | +| project_id | varchar(37) | YES | MUL | NULL | | +| key | varchar(8192) | YES | | NULL | | ++------------+---------------+------+-----+---------+----------------+ +3 rows in set (0.00 sec) + +MariaDB [duffy]> + +``` + + +``` ++-----------+----------------------+----------------------+------------+-------------+ +| apikey | projectname | jobname | createdat | limitnodes | ++-----------+----------------------+----------------------+------------+-------------+ +| xxxx-yyyy | nfs-ganesha | nfs-ganesha | 2016-02-24 | 10 | +| zzzz-aaaa | CentOS | centos_arrfab | 2015-04-17 | 10 | ++-----------+----------------------+----------------------+------------+-------------+ +``` + +## Steps to create a new duffy SSH key +1. On the home directory of user duffy on the admin.ci.centos.org instance, we have a folder where we store the created ssh keys for duffy tenants. +2. `mkdir -p keys/project-name/` then `ssh-keygen -f ~duffy/keys/project-name/id_rsa -C project-name@CI` +3. Copy the public key + +## Steps to create a new duffy API key + +1. How do we connect to instances + +The Duffy database runs on the admin.ci node: `ssh admin.ci.centos.org`. + +2. We have a script which does this work.. how do we use it + +3. Create user in usertable +`insert into users values(UUID(), 'projectname', 'projectname', NOW(), 5);` + +4. Retrieve the api key from the users table +` select * from users where projectname="projectname";` + +5. Using that api-key/UUID as project_id, enter ssh key of a user from the project so that they can ssh into the machines. This process must be repeated for every user we wish to add access to via SSH. +`insert into userkeys (`project_id`,`key`) values('', '');` +This ssh key is pushed to duffy nodes - authorized keys when a tenant requests the node through api key. 
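+
+As a convenience, the whole flow is condensed below. This is only a sketch: it assumes you are the duffy user on admin.ci.centos.org and that the local MariaDB socket allows access to the duffy database; the project name, API key and SSH public key values are placeholders.
+
+```
+# generate the tenant SSH key pair
+mkdir -p ~duffy/keys/project-name/
+ssh-keygen -f ~duffy/keys/project-name/id_rsa -C project-name@CI
+
+# create the API key for the tenant (node limit of 5 in this example)
+mysql duffy -e "insert into users values(UUID(), 'project-name', 'project-name', NOW(), 5);"
+
+# look up the generated API key / UUID
+mysql duffy -e "select apikey from users where projectname='project-name';"
+
+# register the tenant user's SSH public key, using the apikey above as project_id
+# (repeat for every user who needs SSH access to the nodes)
+mysql duffy -e "insert into userkeys (\`project_id\`,\`key\`) values('<apikey>', '<ssh public key>');"
+```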
diff --git a/docs/operations/ci/adding_nodes.md b/docs/operations/ci/adding_nodes.md new file mode 100644 index 0000000..e84e688 --- /dev/null +++ b/docs/operations/ci/adding_nodes.md @@ -0,0 +1,123 @@ +# Adding Compute/Worker nodes +This SOP should be used in the following scenario: + +- Red Hat OpenShift Container Platform 4.x cluster has been installed some time ago (1+ days ago) and additional worker nodes are required to increase the capacity for the cluster. + + +## Steps + +1. Add the new nodes being added to the cluster to the appropriate inventory file in the appropriate group. + +eg: + +``` +# ocp, compute/worker: +[ocp-ci-compute] +newnode1.example.centos.org +newnode2.example.centos.org +newnode3.example.centos.org +newnode4.example.centos.org +newnode5.example.centos.org +``` + +eg: + +``` +# ocp.stg, compute/worker: +[ocp-stg-ci-compute] +newnode6.example.centos.org +newnode7.example.centos.org +newnode8.example.centos.org +newnode9.example.centos.org + +# ocp.stg, master/control plane +[ocp-stg-ci-master] +newnode10.example.centos.org +``` + + +2. Examine the `inventory` file for `ocp` or `ocp.stg` and determine which management node corresponds with the group `ocp-ci-management`. + +eg: + +``` +[ocp-ci-management] +some-managementnode.example.centos.org +``` + +3. Find the OCP admin user which is contained in the hostvars for this management node at the key `ocp_service_account`. + +eg: + +``` +host_vars/some-managementnode.example.centos.org:ocp_service_account: adminuser +``` + +4. SSH to the node identified in step `2`, and become the user identified in step `3`. + +eg: + +``` +ssh some-managementnode.example.centos.org + +sudo su - adminuser +``` + +5. Verify that you are authenticated correctly to the Openshift cluster as the `system:admin`. + +``` +oc whoami +system:admin +``` + +6. Retrieve the certificate from the internal API and convert the contents to base64 string like so. + +eg: + +``` +echo "q" | openssl s_client -connect api-int.ocp.ci.centos.org:22623 -showcerts | awk '/-----BEGIN CERTIFICATE-----/,/-----END CERTIFICATE-----/' | base64 --wrap=0 +DONE +XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXCERTSTOREDASABASE64ENCODEDSTRING= +``` + +7. Replace the cert in the compute/worker ignition file, at the `XXXXXXXXREPLACEMEXXXXXXXX=` point, be sure to save this change in SCM, and push. + +``` +cat filestore/rhcos/compute.ign +{"ignition":{"config":{"append":[{"source":"https://api-int.ocp.ci.centos.org:22623/config/worker","verification":{}}]},"security":{"tls":{"certificateAuthorities":[{"source":"data:text/plain;charset=utf-8;base64,XXXXXXXXREPLACEMEXXXXXXXX=","verification":{}}]}},"timeouts":{},"version":"2.2.0"},"networkd":{},"passwd":{},"storage":{"disks":[{"device":"/dev/sdb","wipeTable":true}]},"systemd":{}} +``` + +8. Once the ignition file has been updated, run the `adhoc-provision-ocp4-node` playbook to copy the updated ignition files up to the http server, and install the new node(s). When prompted, specify the hostname of the new node. Best to do one at a time, it takes a minute or two per new node being added at this step. + +eg: + +``` +ansible-playbook playbooks/adhoc-provision-ocp4-node.yml +[WARNING] Nodes to be fully wiped/reinstalled with OCP => : newnode6.example.centos.org +``` + +9. As the new nodes are provisioned, they will attempt to join the cluster. They must first be accepted. + +``` +# List the certs. If you see status pending, this is the worker/compute nodes attempting to join the cluster. 
It must be approved. +oc get csr + +# Accept all node CSRs one liner +oc get csr -o go-template='{{range .items}}{{if not .status}}{{.metadata.name}}{{"\n"}}{{end}}{{end}}' | xargs oc adm certificate approve +``` + + +10. Finally run the playbook to update haproxy config to monitor the new nodes. + +``` +ansible-playbook playbooks/role-haproxy.yml --tags="config" +``` + + +To see more information about adding new worker/compute nodes to a user provisioned infrastructure based OCP4 cluster see the detailed steps at [1],[2]. + + +### Resources + +- [1] [How to add Openshift 4 RHCOS worker nodes in UPI <24 hours](https://access.redhat.com/solutions/4246261) +- [2] [How to add Openshift 4 RHCOS worker nodes to UPI >24 hours](https://access.redhat.com/solutions/4799921) diff --git a/docs/operations/ci/adding_oidc_authentication.md b/docs/operations/ci/adding_oidc_authentication.md new file mode 100644 index 0000000..17df7fa --- /dev/null +++ b/docs/operations/ci/adding_oidc_authentication.md @@ -0,0 +1,63 @@ +# Adding OIDC Authentication +In CentOS, we have an instance of Ipsilon[1] which we currently use to authenticate many of our services. + + +### Steps +This SOP covers configuring ocp.ci/ocp.stg.ci with an OpenID identity provider which is used to communicate with our ACO Ipsilon instance and provide authentication to the cluster. + +- Authenticate with the ocp.ci/ocp.stg.ci cluster via the cli +- Create an Openshift Secret containing the ACO/Ipsilon clientSecret +- Create an Openshift Oauth object with the identityProvider configuration + + +See below for sample template which achieves this. + + +``` +apiVersion: template.openshift.io/v1 +kind: Template +metadata: + name: openshift-oidc-config +objects: +- kind: Secret + apiVersion: v1 + metadata: + name: openid-client-secret-ocp-ci + namespace: openshift-config + data: + clientSecret: + type: Opaque +- apiVersion: config.openshift.io/v1 + kind: OAuth + metadata: + name: cluster + spec: + identityProviders: + - mappingMethod: claim + name: accounts-centos-org + openID: + claims: + email: + - email + - custom_email_claim + name: + - name + - nickname + - given_name + preferredUsername: + - email + clientID: ocp.ci.centos + clientSecret: + name: openid-client-secret-ocp-ci + extraScopes: + - email + - profile + issuer: 'https://id.centos.org/idp/openidc' + type: OpenID +``` + + + +### Resources: +- [1] [Ipsilon](https://ipsilon-project.org/) + diff --git a/docs/operations/ci/adding_privileged_scc_to_sa.md b/docs/operations/ci/adding_privileged_scc_to_sa.md new file mode 100644 index 0000000..1f69e46 --- /dev/null +++ b/docs/operations/ci/adding_privileged_scc_to_sa.md @@ -0,0 +1,19 @@ +# Adding Privileged SCC to Service Accounts +This SOP should be used in the following scenario: + +- A tenant has been approved to run `privileged container` workloads. + + +## Steps + +1. 
Add the `privileged` security context constraint to the service account in the tenant's namespace like so:
+
+```
+oc adm policy add-scc-to-user privileged -n namespace -z myserviceaccount
+```
+
+
+
+### Resources
+
+- [1] [How to add the privileged SCC to a service account](https://docs.openshift.com/container-platform/4.5/cli_reference/openshift_cli/administrator-cli-commands.html#policy)
diff --git a/docs/operations/ci/adding_taints_to_nodes.md b/docs/operations/ci/adding_taints_to_nodes.md
new file mode 100644
index 0000000..317c1aa
--- /dev/null
+++ b/docs/operations/ci/adding_taints_to_nodes.md
@@ -0,0 +1,24 @@
+# Adding Taints to nodes
+A taint allows a node to control which pods should or should not be scheduled on it. A toleration is something which can be applied to a pod to indicate that it can tolerate a taint, and may mark it as being schedulable on a node with the matching taint.
+
+To view the official docs for Openshift/Kubernetes see [1]. This also provides information on some of the default taints which have special meaning in a Kubernetes environment.
+
+## Example taint
+`node.kubernetes.io/unschedulable` is an example of a special taint which can be applied to a Node configuration. Internal Openshift/Kubernetes systems have tolerations in place for it by default. With this knowledge, we can use it to prevent user workloads from being scheduled, while leaving internal system workloads in place. The effect `PreferNoSchedule` applies the following logic:
+
+- New pods which do not tolerate this taint will preferably not be scheduled on a node carrying it
+- Existing pods are allowed to keep running
+
+For the full list of effects see the official documentation at [1].
+
+```
+spec:
+  taints:
+  - key: node.kubernetes.io/unschedulable
+    effect: PreferNoSchedule
+```
+
+
+### Resources
+
+- [1] [Controlling Pod Placement using Node Taints](https://docs.openshift.com/container-platform/4.5/nodes/scheduling/nodes-scheduler-taints-tolerations.html)
diff --git a/docs/operations/ci/authenticating_via_cli.md b/docs/operations/ci/authenticating_via_cli.md
new file mode 100644
index 0000000..32d2ebc
--- /dev/null
+++ b/docs/operations/ci/authenticating_via_cli.md
@@ -0,0 +1,15 @@
+## Authenticating via CLI
+Members of the CentOS CI Infrastructure team have admin access to the ocp.ci and ocp.stg.ci Openshift clusters for their ACO accounts.
+
+To log in via the CLI using your main account authenticated via ACO:
+
+- Authenticate via the accounts-centos-org option in ocp.ci/ocp.stg.ci
+- In the top right of the Openshift console, click the drop down menu for your user and click `Copy Login Command`
+- Copy the `oc login --token=xxxx --server=https://uri` command shown under `Log in with this token` and paste it into your terminal
+
+To log in via the CLI as the `system:admin` user:
+
+- ssh to the admin node which corresponds with the `ocp-ci-management` or `ocp-ci-management-stg` inventory group
+- Change to the OCP admin user on the admin node, choosing the appropriate version: `sudo su - ocpadm` or, for staging, `sudo su - ocpadmstg`
+- `export KUBECONFIG=/home/<admin user>/.kube/config`
+- You should now have `system:admin` access to the cluster, as sketched below.
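+
+A minimal end-to-end sketch of the `system:admin` path (staging shown; the admin node hostname is a placeholder, and the kubeconfig is assumed to live in the admin user's home directory):
+
+```
+ssh <ocp-ci-management-stg admin node>
+sudo su - ocpadmstg
+export KUBECONFIG=/home/ocpadmstg/.kube/config
+oc whoami
+# system:admin
+oc get nodes
+```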
diff --git a/docs/operations/ci/cleaning_legacy_jenkins_storage.md b/docs/operations/ci/cleaning_legacy_jenkins_storage.md
new file mode 100644
index 0000000..5d31dcb
--- /dev/null
+++ b/docs/operations/ci/cleaning_legacy_jenkins_storage.md
@@ -0,0 +1,35 @@
+# Cleaning jenkins storage
+When receiving Zabbix alerts for low storage on the legacy Jenkins, we can prune old builds from some of the largest storage users on the cluster using this SOP.
+
+
+## Step 1: Creating a token
+* First generate a Jenkins token: go to `https://ci.centos.org/user/username/configure`
+* Create a token from the API token section
+* Set the username and token variables below
+
+```
+JENKINSUSER=username
+JENKINSAPITOKEN=token
+```
+
+
+## Step 2: Getting list of jobs
+* ssh into `jenkins.ci.centos.org`
+* Find the list of projects which are consuming the most space with `du -csh /var/lib/jenkins/* | grep 'G' | sort -r`
+
+
+## Getting a crumb token
+* Use curl to generate a crumb token (note the double quotes, so that the user and token variables are expanded)
+
+```
+CRUMB=$(curl "https://$JENKINSUSER:$JENKINSAPITOKEN@ci.centos.org/crumbIssuer/api/xml?xpath=concat(//crumbRequestField,\":\",//crumb)")
+```
+
+
+## Deleting builds from job
+* Now with the crumb token set, we can delete builds using the API.
+* In the following example, update the `jobname` and the `start range`/`end range` values, which correspond to the build numbers in that job:
+
+```
+curl -H "$CRUMB" -X POST "https://$JENKINSUSER:$JENKINSAPITOKEN@ci.centos.org/job//[-]/doDelete"
+```
diff --git a/docs/operations/ci/configuring_default_aco_user_permissions.md b/docs/operations/ci/configuring_default_aco_user_permissions.md
new file mode 100644
index 0000000..0431369
--- /dev/null
+++ b/docs/operations/ci/configuring_default_aco_user_permissions.md
@@ -0,0 +1,66 @@
+## Configure default permissions for ACO users
+By default, all users which are authenticated with Openshift (system:authenticated) will be a part of the `self-provisioners` group. This role provides basic access to create projects etc., in which the user then has admin access.
+
+To prevent this, we must first delete the `self-provisioners` ClusterRoleBinding. Should we ever wish to restore it for whatever reason, the following is the original content of the object:
+
+```
+kind: ClusterRoleBinding
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: self-provisioners
+  annotations:
+    rbac.authorization.kubernetes.io/autoupdate: 'true'
+subjects:
+  - kind: Group
+    apiGroup: rbac.authorization.k8s.io
+    name: 'system:authenticated:oauth'
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: self-provisioner
+```
+
+Once removed, a new user which authenticates via ACO no longer has permission to do much of anything, beyond what the `basic-user` role provides.
+
+To find this role originally, see resources [1][2]. To list the cluster roles and their bindings, run `oc describe clusterrole.rbac` and `oc describe clusterrolebinding.rbac`. Searching for `system:authenticated` points toward which role is automatically applied to users authenticated with the cluster.
+
+### Adding permissions to an authenticated user
+We first create a group which will contain all the users for a particular project. eg:
+
+```
+kind: Group
+apiVersion: user.openshift.io/v1
+metadata:
+  name: project-group-admins
+users:
+  - user2
+  - user1
+```
+
+Then create a project/namespace for the project. eg: `oc create namespace "project"`
+
+Next create a rolebinding for the group to a role.
We want to give members of this group admin access within the namespace, eg:
+
+```
+kind: RoleBinding
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: project-admins
+  namespace: project
+subjects:
+  - kind: Group
+    apiGroup: rbac.authorization.k8s.io
+    name: project-group-admins
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: admin
+```
+
+Users listed in the group will now have admin access to the project/namespace and nothing else within the cluster, which is what we want.
+
+
+### Resources
+- [1] Using RBAC to define and apply permissions https://docs.openshift.com/container-platform/4.4/authentication/using-rbac.html#default-roles_using-rbac
+- [2] Using OIDC to authenticate https://docs.openshift.com/container-platform/4.4/authentication/identity_providers/configuring-oidc-identity-provider.html#configuring-oidc-identity-provider
+
diff --git a/docs/operations/ci/configuring_image_registry.md b/docs/operations/ci/configuring_image_registry.md
new file mode 100644
index 0000000..86f1ea8
--- /dev/null
+++ b/docs/operations/ci/configuring_image_registry.md
@@ -0,0 +1,55 @@
+## Image Registry
+
+### Resources
+- [1] https://docs.openshift.com/container-platform/4.4/registry/configuring_registry_storage/configuring-registry-storage-baremetal.html
+
+
+### Prerequisites
+
+- Cluster administrator permissions.
+- A cluster on bare metal.
+- Provision persistent storage for your cluster, such as Red Hat OpenShift Container Storage. To deploy a private image registry, your storage must provide ReadWriteMany access mode.
+- The storage must have 100Gi capacity.
+
+
+
+To start the image registry, you must change the Image Registry Operator's ManagementState configuration from Removed to Managed.
+Leave the claim field blank to allow the automatic creation of an image-registry-storage PVC.
+
+
+```
+$ oc edit configs.imageregistry/cluster
+apiVersion: imageregistry.operator.openshift.io/v1
+kind: Config
+metadata:
+...
+spec:
+...
+  managementState: Managed
+  storage:
+    pvc:
+      claim:
+...
+```
+
+
+We want to enable the image pruner to occasionally prune images in the registry.
+
+```
+$ oc edit imagepruner.imageregistry/cluster
+apiVersion: imageregistry.operator.openshift.io/v1
+kind: ImagePruner
+metadata:
+  name: cluster
+spec:
+  suspend: false
+...
+```
+
+
+Check the status of the deployment:
+
+```
+oc get clusteroperator image-registry
+```
+
diff --git a/docs/operations/ci/cordoning_nodes_and_draining_pods.md b/docs/operations/ci/cordoning_nodes_and_draining_pods.md
new file mode 100644
index 0000000..e0ab9a2
--- /dev/null
+++ b/docs/operations/ci/cordoning_nodes_and_draining_pods.md
@@ -0,0 +1,54 @@
+# Cordoning Nodes and Draining Pods
+This SOP should be followed in the following scenarios:
+
+- If maintenance is scheduled to be carried out on an Openshift node.
+
+
+## Steps
+
+1. Mark the node(s) as unschedulable:
+
+```
+nodes=$(oc get nodes -o name | sed -E "s/node\///")
+echo $nodes
+
+for node in ${nodes[@]}; do oc adm cordon $node; done
+node/ cordoned
+```
+
+2. Check that the node status is `NotReady,SchedulingDisabled`
+
+```
+oc get node
+NAME   STATUS                        ROLES    AGE   VERSION
+       NotReady,SchedulingDisabled   worker   1d    v1.18.3
+```
+
+Note: it might not switch to `NotReady` immediately; there may be many pods still running.
+
+
+3. Evacuate the pods from the **worker nodes** using the following method.
+This will drain node ``, delete any local data, ignore daemonsets, and give a grace period of 60 seconds for pods to drain gracefully.
+
+```
+oc adm drain --delete-local-data=true --ignore-daemonsets=true --grace-period=60
+```
+
+4. Perform the scheduled maintenance on the node.
+Do whatever is required in the scheduled maintenance window.
+
+
+5. Once the node is ready to be added back into the cluster,
+we must uncordon it. This allows it to be marked schedulable once more.
+
+```
+nodes=$(oc get nodes -o name | sed -E "s/node\///")
+echo $nodes
+
+for node in ${nodes[@]}; do oc adm uncordon $node; done
+```
+
+
+### Resources
+
+- [1] [Nodes - working with nodes](https://docs.openshift.com/container-platform/4.5/nodes/nodes/nodes-nodes-working.html)
diff --git a/docs/operations/ci/create_etcd_backup.md b/docs/operations/ci/create_etcd_backup.md
new file mode 100644
index 0000000..d629b36
--- /dev/null
+++ b/docs/operations/ci/create_etcd_backup.md
@@ -0,0 +1,49 @@
+# Create etcd backup
+This SOP should be followed in the following scenarios:
+
+- When the need exists to create an etcd backup.
+- When shutting a cluster down gracefully.
+
+## Steps
+
+1. Connect to a master node
+
+```
+oc debug node/
+```
+
+2. Chroot to the /host directory on the container's filesystem
+
+```
+sh-4.2# chroot /host
+```
+
+3. Run the cluster-backup.sh script and pass in the location to save the backup to
+
+```
+sh-4.4# /usr/local/bin/cluster-backup.sh /home/core/assets/backup
+```
+
+4. Chown the backup files to be owned by user `core` and group `core`
+
+```
+chown -R core:core /home/core/assets/backup
+```
+
+5. From the admin machine (see inventory group: `ocp-ci-management`), become the Openshift service account: check the inventory hostvars for this host and note the `ocp_service_account` variable.
+
+```
+ssh 
+sudo su - 
+```
+
+6. Copy the files down to the admin machine.
+
+```
+scp -i core@:/home/core/assets/backup/* ocp_backups/
+```
+
+
+### Resources
+
+- [1] [Creating an etcd backup](https://docs.openshift.com/container-platform/4.5/backup_and_restore/backing-up-etcd.html#backing-up-etcd-data_backup-etcd)
diff --git a/docs/operations/ci/disabling_self_provisioner_role.md b/docs/operations/ci/disabling_self_provisioner_role.md
new file mode 100644
index 0000000..dde0144
--- /dev/null
+++ b/docs/operations/ci/disabling_self_provisioner_role.md
@@ -0,0 +1,69 @@
+# Disabling self-provisioners role
+By default, when a user authenticates with Openshift via OAuth, they are part of the `self-provisioners` group. This group provides the ability to create new projects. On CentOS CI we do not want users to be able to create their own projects, as we have a system in place where we create a project and control the administrators of that project.
+
+To disable the self-provisioner role, do the following, as outlined in the documentation[1].
+
+```
+oc describe clusterrolebinding.rbac self-provisioners
+
+Name:         self-provisioners
+Labels:       <none>
+Annotations:  rbac.authorization.kubernetes.io/autoupdate=true
+Role:
+  Kind:  ClusterRole
+  Name:  self-provisioner
+Subjects:
+  Kind   Name                        Namespace
+  ----   ----                        ---------
+  Group  system:authenticated:oauth
+```
+
+Remove the subjects that the self-provisioners role applies to.
+
+```
+oc patch clusterrolebinding.rbac self-provisioners -p '{"subjects": null}'
+```
+
+Verify the change occurred successfully:
+
+```
+oc describe clusterrolebinding.rbac self-provisioners
+Name:         self-provisioners
+Labels:       <none>
+Annotations:  rbac.authorization.kubernetes.io/autoupdate: true
+Role:
+  Kind:  ClusterRole
+  Name:  self-provisioner
+Subjects:
+  Kind  Name  Namespace
+  ----  ----  ---------
+```
+
+When the cluster is updated to a new version, unless we mark the role appropriately, the permissions will be restored after the update is complete.
+
+Verify that the value is currently set to be restored after an update:
+
+```
+oc get clusterrolebinding.rbac self-provisioners -o yaml
+```
+
+```
+apiVersion: authorization.openshift.io/v1
+kind: ClusterRoleBinding
+metadata:
+  annotations:
+    rbac.authorization.kubernetes.io/autoupdate: "true"
+  ...
+```
+
+We wish to set `rbac.authorization.kubernetes.io/autoupdate` to `false`. To patch this, do the following.
+
+```
+oc patch clusterrolebinding.rbac self-provisioners -p '{ "metadata": { "annotations": { "rbac.authorization.kubernetes.io/autoupdate": "false" } } }'
+```
+
+
+
+### Resources
+
+- [1] https://docs.openshift.com/container-platform/4.4/applications/projects/configuring-project-creation.html#disabling-project-self-provisioning_configuring-project-creation
diff --git a/docs/operations/ci/graceful_shutdown_ocp_cluster.md b/docs/operations/ci/graceful_shutdown_ocp_cluster.md
new file mode 100644
index 0000000..dbd4fde
--- /dev/null
+++ b/docs/operations/ci/graceful_shutdown_ocp_cluster.md
@@ -0,0 +1,29 @@
+# Graceful Shutdown of an Openshift 4 Cluster
+This SOP should be followed in the following scenarios:
+
+- Shutting down an Openshift 4 cluster.
+
+
+## Steps
+
+Prerequisite steps:
+- Follow the SOP for cordoning and draining the nodes.
+- Follow the SOP for creating an `etcd` backup.
+
+
+1. Get the nodes
+
+```
+nodes=$(oc get nodes -o name | sed -E "s/node\///")
+```
+
+2. Shut down the nodes from the administration box associated with the cluster, eg prod/staging.
+
+```
+for node in ${nodes[@]}; do ssh -i core@$node sudo shutdown -h now; done
+```
+
+
+### Resources
+
+- [1] [Graceful Cluster Shutdown](https://docs.openshift.com/container-platform/4.5/backup_and_restore/graceful-cluster-shutdown.html)
diff --git a/docs/operations/ci/graceful_startup_ocp_cluster.md b/docs/operations/ci/graceful_startup_ocp_cluster.md
new file mode 100644
index 0000000..1e5711f
--- /dev/null
+++ b/docs/operations/ci/graceful_startup_ocp_cluster.md
@@ -0,0 +1,71 @@
+# Graceful Startup of an Openshift 4 Cluster
+This SOP should be followed in the following scenarios:
+
+- Starting up an Openshift 4 cluster.
+
+
+## Steps
+
+Prerequisite steps:
+
+
+1. Start the physical nodes
+
+- Production uses the `adhoc-openshift-nfs-stats.yaml` playbook to stop/start/restart nodes
+- Staging uses the seamicro, accessible from the admin machine; the user manual is contained in centosci/ocp4-docs/sops/seamicro
+
+2. 
Once the nodes have been started they must be uncordoned if appropriate + +``` +oc get nodes +NAME STATUS ROLES AGE VERSION +dumpty-n1.ci.centos.org Ready,SchedulingDisabled worker 77d v1.18.3+6c42de8 +dumpty-n2.ci.centos.org Ready,SchedulingDisabled worker 77d v1.18.3+6c42de8 +dumpty-n3.ci.centos.org Ready,SchedulingDisabled worker 77d v1.18.3+6c42de8 +dumpty-n4.ci.centos.org Ready,SchedulingDisabled worker 77d v1.18.3+6c42de8 +dumpty-n5.ci.centos.org Ready,SchedulingDisabled worker 77d v1.18.3+6c42de8 +kempty-n10.ci.centos.org Ready,SchedulingDisabled worker 106d v1.18.3+6c42de8 +kempty-n11.ci.centos.org Ready,SchedulingDisabled worker 106d v1.18.3+6c42de8 +kempty-n12.ci.centos.org Ready,SchedulingDisabled worker 106d v1.18.3+6c42de8 +kempty-n6.ci.centos.org Ready,SchedulingDisabled master 106d v1.18.3+6c42de8 +kempty-n7.ci.centos.org Ready,SchedulingDisabled master 106d v1.18.3+6c42de8 +kempty-n8.ci.centos.org Ready,SchedulingDisabled master 106d v1.18.3+6c42de8 +kempty-n9.ci.centos.org Ready,SchedulingDisabled worker 106d v1.18.3+6c42de8 + +nodes=$(oc get nodes -o name | sed -E "s/node\///") + +for node in ${nodes[@]}; do oc adm uncordon $node; done +node/dumpty-n1.ci.centos.org uncordoned +node/dumpty-n2.ci.centos.org uncordoned +node/dumpty-n3.ci.centos.org uncordoned +node/dumpty-n4.ci.centos.org uncordoned +node/dumpty-n5.ci.centos.org uncordoned +node/kempty-n10.ci.centos.org uncordoned +node/kempty-n11.ci.centos.org uncordoned +node/kempty-n12.ci.centos.org uncordoned +node/kempty-n6.ci.centos.org uncordoned +node/kempty-n7.ci.centos.org uncordoned +node/kempty-n8.ci.centos.org uncordoned +node/kempty-n9.ci.centos.org uncordoned + +oc get nodes +NAME STATUS ROLES AGE VERSION +dumpty-n1.ci.centos.org Ready worker 77d v1.18.3+6c42de8 +dumpty-n2.ci.centos.org Ready worker 77d v1.18.3+6c42de8 +dumpty-n3.ci.centos.org Ready worker 77d v1.18.3+6c42de8 +dumpty-n4.ci.centos.org Ready worker 77d v1.18.3+6c42de8 +dumpty-n5.ci.centos.org Ready worker 77d v1.18.3+6c42de8 +kempty-n10.ci.centos.org Ready worker 106d v1.18.3+6c42de8 +kempty-n11.ci.centos.org Ready worker 106d v1.18.3+6c42de8 +kempty-n12.ci.centos.org Ready worker 106d v1.18.3+6c42de8 +kempty-n6.ci.centos.org Ready master 106d v1.18.3+6c42de8 +kempty-n7.ci.centos.org Ready master 106d v1.18.3+6c42de8 +kempty-n8.ci.centos.org Ready master 106d v1.18.3+6c42de8 +kempty-n9.ci.centos.org Ready worker 106d v1.18.3+6c42de8 +``` + + +### Resources + +- [1] [Graceful Cluster Startup](https://docs.openshift.com/container-platform/4.5/backup_and_restore/graceful-cluster-restart.html) +- [2] [Cluster disaster recovery](https://docs.openshift.com/container-platform/4.5/backup_and_restore/disaster_recovery/scenario-2-restoring-cluster-state.html#dr-restoring-cluster-state) diff --git a/docs/operations/ci/index.md b/docs/operations/ci/index.md deleted file mode 100644 index e69de29..0000000 --- a/docs/operations/ci/index.md +++ /dev/null diff --git a/docs/operations/ci/ingress_controllers/README.md b/docs/operations/ci/ingress_controllers/README.md new file mode 100644 index 0000000..97667bd --- /dev/null +++ b/docs/operations/ci/ingress_controllers/README.md @@ -0,0 +1,60 @@ +# Spike: Investigate adding routes from apps.ci.centos.org +The Ingress Operator[1] manages `IngressController` resources which will allow us to achieve[3] this on Openshift 4. 
+ +### Resources +- [1] https://docs.openshift.com/container-platform/4.4/networking/ingress-operator.html +- [2] https://rcarrata.com/openshift/ocp4_route_sharding/ +- [3] https://projects.engineering.redhat.com/browse/CPE-764 + + +### POC +Performed the following steps to achieve goal: + +``` +-rw-rw-r--. 1 dkirwan dkirwan 1060 Jul 6 18:12 deployment.yaml +-rw-rw-r--. 1 dkirwan dkirwan 286 Jul 6 17:13 ingresscontroller.yaml +-rw-rw-r--. 1 dkirwan dkirwan 336 Jul 6 17:53 route.yaml +-rw-rw-r--. 1 dkirwan dkirwan 273 Jul 6 17:58 service.yaml +``` +- Created an `IngressController` which creates 2 router replicas and configured to manage Routes which point at `*.apps.ci.centos.org`. It also has a `routeSelector` to match labels `type: sharded` +``` + routeSelector: + matchLabels: + type: sharded +``` +- Created a Deployment with simple app. +- Created Service and Route to expose the app externally at `ingress-controller-test.apps.ci.centos.org`. +- Route has been given a label: `type: sharded` +- Used `dig` to retrieve the public IP address of the cluster +``` +dig console-openshift-console.apps.ocp.stg.ci.centos.org + +; <<>> DiG 9.11.18-RedHat-9.11.18-1.fc32 <<>> console-openshift-console.apps.ocp.stg.ci.centos.org +;; global options: +cmd +;; Got answer: +;; ->>HEADER<<- opcode: QUERY, status: NOERROR, id: 21722 +;; flags: qr rd ra; QUERY: 1, ANSWER: 1, AUTHORITY: 0, ADDITIONAL: 1 + +;; OPT PSEUDOSECTION: +; EDNS: version: 0, flags:; udp: 4096 +;; QUESTION SECTION: +;console-openshift-console.apps.ocp.stg.ci.centos.org. IN A + +;; ANSWER SECTION: +console-openshift-console.apps.ocp.stg.ci.centos.org. 600 IN A 8.43.84.237 + +;; Query time: 77 msec +;; SERVER: 10.38.5.26#53(10.38.5.26) +;; WHEN: Mon Jul 06 18:43:35 IST 2020 +;; MSG SIZE rcvd: 97 +``` +- Configured my `/etc/hosts` file accordingly: +``` +cat /etc/hosts +127.0.0.1 localhost localhost.localdomain localhost4 localhost4.localdomain4 +::1 localhost localhost.localdomain localhost6 localhost6.localdomain6 +8.43.84.237 ingress-controller-test.apps.ci.centos.org +``` +- Visited `http://ingress-controller-test.apps.ci.centos.org` and was greeted with the expected content in the deployed app. +- We should be able to achieve this spike CPE-764 using the Ingress Controller Operator. 
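+
+To sanity check the new ingress controller, the router deployment created by the operator can be inspected. This is a sketch and assumes the operator's default conventions (a `router-<ingresscontroller name>` deployment in `openshift-ingress`, labelled with its owning ingresscontroller):
+
+```
+# the sharded router replicas created for the cpe-764-spike IngressController
+oc -n openshift-ingress get deployments
+oc -n openshift-ingress get pods -l ingresscontroller.operator.openshift.io/owning-ingresscontroller=cpe-764-spike
+```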
+ diff --git a/docs/operations/ci/ingress_controllers/deployment.yaml b/docs/operations/ci/ingress_controllers/deployment.yaml new file mode 100644 index 0000000..a4865b6 --- /dev/null +++ b/docs/operations/ci/ingress_controllers/deployment.yaml @@ -0,0 +1,41 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ingress-controller-deployment + namespace: ingress-controller-test +spec: + progressDeadlineSeconds: 600 + replicas: 1 + revisionHistoryLimit: 10 + selector: + matchLabels: + app: ingress-controller-test + strategy: + rollingUpdate: + maxSurge: 25% + maxUnavailable: 25% + type: RollingUpdate + template: + metadata: + annotations: + openshift.io/generated-by: OpenShiftWebConsole + creationTimestamp: null + labels: + app: ingress-controller-test + spec: + containers: + - image: quay.io/dkirwan_redhat/crypto_monitoring:v0.0.1 + imagePullPolicy: IfNotPresent + name: ingress-controller-test + ports: + - containerPort: 8080 + protocol: TCP + resources: {} + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + dnsPolicy: ClusterFirst + restartPolicy: Always + schedulerName: default-scheduler + securityContext: {} + terminationGracePeriodSeconds: 30 + diff --git a/docs/operations/ci/ingress_controllers/ingresscontroller.yaml b/docs/operations/ci/ingress_controllers/ingresscontroller.yaml new file mode 100644 index 0000000..3e3fb8f --- /dev/null +++ b/docs/operations/ci/ingress_controllers/ingresscontroller.yaml @@ -0,0 +1,14 @@ +apiVersion: operator.openshift.io/v1 +kind: IngressController +metadata: + name: cpe-764-spike + namespace: openshift-ingress-operator +spec: + domain: apps.ci.centos.org + endpointPublishingStrategy: + type: HostNetwork + routeSelector: + matchLabels: + type: sharded +status: {} + diff --git a/docs/operations/ci/ingress_controllers/route.yaml b/docs/operations/ci/ingress_controllers/route.yaml new file mode 100644 index 0000000..8d0b7dd --- /dev/null +++ b/docs/operations/ci/ingress_controllers/route.yaml @@ -0,0 +1,17 @@ +apiVersion: route.openshift.io/v1 +kind: Route +metadata: + creationTimestamp: null + labels: + type: sharded + name: test + namespace: ingress-controller-test +spec: + host: ingress-controller-test.apps.ci.centos.org + port: + targetPort: 8080-tcp + to: + kind: Service + name: test-service + weight: 100 + wildcardPolicy: None diff --git a/docs/operations/ci/ingress_controllers/service.yaml b/docs/operations/ci/ingress_controllers/service.yaml new file mode 100644 index 0000000..a74e98e --- /dev/null +++ b/docs/operations/ci/ingress_controllers/service.yaml @@ -0,0 +1,16 @@ +apiVersion: v1 +kind: Service +metadata: + name: test-service + namespace: ingress-controller-test +spec: + ports: + - name: 8080-tcp + port: 8080 + protocol: TCP + targetPort: 8080 + selector: + app: ingress-controller-test + sessionAffinity: None + type: ClusterIP + diff --git a/docs/operations/ci/installation/install.md b/docs/operations/ci/installation/install.md new file mode 100644 index 0000000..73d57a7 --- /dev/null +++ b/docs/operations/ci/installation/install.md @@ -0,0 +1,208 @@ +# Steps for installing OCP 4.3 on bare metal: + +Documentation: [docs](https://access.redhat.com/documentation/en-us/openshift_container_platform/4.3/html/installing_on_bare_metal/installing-on-bare-metal) + +## Install: +* mkdir ocp-ci-centos-org +* cd ocp-ci-centos-org +* For installations of OpenShift Container Platform that use user-provisioned infrastructure, you must manually generate your installation configuration file. +* 1.1.7.1. 
for sample config see: [here](https://projects.engineering.redhat.com/secure/attachment/104626/install-config.yaml.bak) + +``` +apiVersion: v1 +baseDomain: centos.org +compute: +- hyperthreading: Enabled + name: worker + replicas: 0 +controlPlane: + hyperthreading: Enabled + name: master + replicas: 3 +metadata: + name: ocp.ci +networking: + clusterNetwork: + - cidr: 10.128.0.0/14 + hostPrefix: 23 + networkType: OpenShiftSDN + serviceNetwork: + - 172.30.0.0/16 +platform: + none: {} +fips: false +pullSecret: '' +sshKey: '' +``` + + +* get the **pullsecret** from [https://cloud.redhat.com/openshift/install/metal/user-provisioned](https://cloud.redhat.com/openshift/install/metal/user-provisioned) requires your access.redhat.com login. +* “You must set the value of the replicas parameter to 0. This parameter controls the number of workers that the cluster creates and manages for you, which are functions that the cluster does not perform when you use user-provisioned infrastructure. You must manually deploy worker machines for the cluster to use before you finish installing OpenShift Container Platform.” +* **1.1.8**. Once the **install-config.yaml** configuration has been added correctly, take a backup of this file for future installs or reference as the next step will consume it. Then run the following: +* `openshift-install create manifests --dir=/home/dkirwan/ocp-ci-centos-org` + + INFO Consuming Install Config from target directory + WARNING Certificate 35183CE837878BAC77A802A8A00B6434857 from additionalTrustBundle is x509 v3 but not a certificate authority + WARNING Making control-plane schedulable by setting MastersSchedulable to true for Scheduler cluster settings. +* Running this command converts the **install-config.yaml** to a number of files eg: +``` + ~/ocp-ci-centos-org $ tree . + . + ├── manifests + │ ├── 04-openshift-machine-config-operator.yaml + │ ├── cluster-config.yaml + │ ├── cluster-dns-02-config.yml + │ ├── cluster-infrastructure-02-config.yml + │ ├── cluster-ingress-02-config.yml + │ ├── cluster-network-01-crd.yml + │ ├── cluster-network-02-config.yml + │ ├── cluster-proxy-01-config.yaml + │ ├── cluster-scheduler-02-config.yml + │ ├── cvo-overrides.yaml + │ ├── etcd-ca-bundle-configmap.yaml + │ ├── etcd-client-secret.yaml + │ ├── etcd-host-service-endpoints.yaml + │ ├── etcd-host-service.yaml + │ ├── etcd-metric-client-secret.yaml + │ ├── etcd-metric-serving-ca-configmap.yaml + │ ├── etcd-metric-signer-secret.yaml + │ ├── etcd-namespace.yaml + │ ├── etcd-service.yaml + │ ├── etcd-serving-ca-configmap.yaml + │ ├── etcd-signer-secret.yaml + │ ├── kube-cloud-config.yaml + │ ├── kube-system-configmap-root-ca.yaml + │ ├── machine-config-server-tls-secret.yaml + │ ├── openshift-config-secret-pull-secret.yaml + │ └── user-ca-bundle-config.yaml + └── openshift + ├── 99_kubeadmin-password-secret.yaml + ├── 99_openshift-cluster-api_master-user-data-secret.yaml + ├── 99_openshift-cluster-api_worker-user-data-secret.yaml + ├── 99_openshift-machineconfig_99-master-ssh.yaml + ├── 99_openshift-machineconfig_99-worker-ssh.yaml + └── openshift-install-manifests.yaml + 2 directories, 32 files +``` + +* Edit **manifests/cluster-scheduler-02-config.yml** and set **mastersSchedulable** to false. This will prevent Pods from being scheduled on the master instances. 
+* `sed -i 's/mastersSchedulable: true/mastersSchedulable: false/g' manifests/cluster-scheduler-02-config.yml` +* Create the machineconfigs to disable dhcp on the master/worker nodes: + +``` +for variant in master worker; do +cat << EOF > ./99_openshift-machineconfig_99-${variant}-nm-nodhcp.yaml +apiVersion: machineconfiguration.openshift.io/v1 +kind: MachineConfig +metadata: + labels: + machineconfiguration.openshift.io/role: ${variant} + name: nm-${variant}-nodhcp +spec: + config: + ignition: + config: {} + security: + tls: {} + timeouts: {} + version: 2.2.0 + networkd: {} + passwd: {} + storage: + files: + - contents: + source: data:text/plain;charset=utf-8;base64,W21haW5dCm5vLWF1dG8tZGVmYXVsdD0qCg== + verification: {} + filesystem: root + mode: 0644 + path: /etc/NetworkManager/conf.d/disabledhcp.conf + osImageURL: "" +EOF +done +``` + +* *NOTE* There is a gotcha here, fs mode is **octal** and should start with 0 eg 0644 (-rwxr--r--), however it will be **decimal** value 420 when queried later via kubernetes api. +* Create the ignition configurations: +* Rename `worker.ign` to `compute.ign`, as later steps in the process are configured to point at compute.ign. + +``` +openshift-install create ignition-configs --dir=/home/dkirwan/ocp-ci-centos-org +INFO Consuming OpenShift Install (Manifests) from target directory +INFO Consuming Common Manifests from target directory +INFO Consuming Master Machines from target directory +INFO Consuming Worker Machines from target directory +INFO Consuming Openshift Manifests from target directory + +# Should have the following layout +. +├── auth +│ ├── kubeadmin-password +│ └── kubeconfig +├── bootstrap.ign +├── master.ign +├── metadata.json +└── compute.ign +``` + + +* *NOTE* for production ie `ocp.ci` we must perform an extra step at this point, as the machines have 2 hard disks attached. We want to ensure that `/dev/sdb` gets its partition table wiped at bootstrapping time, so at a later time we can configure the Local Storage Operator to manage this disk drive. +* Modify the `master.ign` and `compute.ign` ignition files with the following: + +``` ++ "storage":{"disks":[{"device":"/dev/sdb","wipeTable":true}]}, +- "storage":{}, +``` + + +* **1.1.9. Creating Red Hat Enterprise Linux CoreOS (RHCOS) machines** +* Prerequisites: +* Obtain the Ignition config files for your cluster. +* Configure suitable PXE or iPXE infrastructure. +* Have access to an HTTP server that you can access from your computer. 
+* Have a load balancer eg Haproxy available +* You must download the kernel, initramfs, ISO file and the RAW disk files eg: +* [https://mirror.openshift.com/pub/openshift-v4/dependencies/rhcos/4.3/latest/](https://mirror.openshift.com/pub/openshift-v4/dependencies/rhcos/4.3/latest/) + * [rhcos-4.3.8-x86_64-installer-kernel-x86_64](https://mirror.openshift.com/pub/openshift-v4/dependencies/rhcos/4.3/latest/rhcos-4.3.8-x86_64-installer-kernel-x86_64) + * [rhcos-4.3.8-x86_64-installer-initramfs.x86_64.img](https://mirror.openshift.com/pub/openshift-v4/dependencies/rhcos/4.3/latest/rhcos-4.3.8-x86_64-installer-initramfs.x86_64.img) + * [rhcos-4.3.8-x86_64-installer.x86_64.iso](https://mirror.openshift.com/pub/openshift-v4/dependencies/rhcos/4.3/latest/rhcos-4.3.8-x86_64-installer.x86_64.iso) + * [rhcos-4.3.8-x86_64-metal.x86_64.raw.gz](https://mirror.openshift.com/pub/openshift-v4/dependencies/rhcos/4.3/latest/rhcos-4.3.8-x86_64-metal.x86_64.raw.gz) +* These files should be copied over to a webserver which is accessible from the bootstrap/master/compute instances. +* **1.1.9.2.** “Configure the network boot infrastructure so that the machines boot from their local disks after RHCOS is installed on them. “ +* Existing CentOS PXE boot configuration Ansible [example](https://github.com/CentOS/ansible-infra-playbooks/blob/master/templates/pxeboot.j2) +* Example RHCOS PXE boot configuration [here](https://projects.engineering.redhat.com/secure/attachment/104734/centos-ci-pxe_sampleconfig.txt) +* **1.1.10. Once the systems are booting and installing, you can monitor the installation with: `./openshift-install --dir=/home/dkirwan/ocp-ci-centos-org wait-for bootstrap-complete --log-level=info` +* Once the master nodes come up successfully, this command will exit. We can now remove the bootstrap instance, and repurpose it as a worker/compute node. +* Run the haproxy role, once the bootstrap node has been removed from the `ocp-ci-master-and-bootstrap-stg` ansible inventory group. +* Begin installing the compute/worker nodes. +* Once the workers are up accept them into the cluster by accepting their `csr` certs: +``` +# List the certs. If you see status pending, this is the worker/compute nodes attempting to join the cluster. It must be approved. +oc get csr + +# Accept all node CSRs one liner +oc get csr -o go-template='{{range .items}}{{if not .status}}{{.metadata.name}}{{"\n"}}{{end}}{{end}}' | xargs oc adm certificate approve +``` +* 1.1.11. Logging in to the cluster. At this point the cluster is up, and we’re in configuration territory. 
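+
+A quick way to sanity check the login step above, assuming the same installation directory used throughout this document:
+
+```
+export KUBECONFIG=/home/dkirwan/ocp-ci-centos-org/auth/kubeconfig
+oc whoami
+# system:admin
+oc get nodes
+oc get clusteroperators
+```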
+ + +## Manually test the bootstrap process RHCOS + +Resources: + +* [1] JIRA corresponding with this section: [CPE-661](https://projects.engineering.redhat.com/browse/CPE-661) +* [2] [https://github.com/CentOS/ansible-infra-playbooks/pull/4](https://github.com/CentOS/ansible-infra-playbooks/pull/4) +* [3] [https://scm.infra.centos.org/CentOS/ansible-inventory-ci/pulls/1](https://scm.infra.centos.org/CentOS/ansible-inventory-ci/pulls/1) +* [4] [https://scm.infra.centos.org/CentOS/ansible-pkistore-ci/pulls/1](https://scm.infra.centos.org/CentOS/ansible-pkistore-ci/pulls/1) +* [5] [CentOS/ansible-infra-playbooks/staging/templates/ocp_pxeboot.j2](https://raw.githubusercontent.com/CentOS/ansible-infra-playbooks/staging/templates/ocp_pxeboot.j2) +* [https://www.openshift.com/blog/openshift-4-bare-metal-install-quickstart](https://www.openshift.com/blog/openshift-4-bare-metal-install-quickstart) +* [6] [Create a raid enabled data volume via ignition file](https://coreos.com/ignition/docs/latest/examples.html#create-a-raid-enabled-data-volume) +* [7] HAProxy config for OCP4 [https://github.com/openshift-tigerteam/guides/blob/master/ocp4/ocp4-haproxy.cfg](https://github.com/openshift-tigerteam/guides/blob/master/ocp4/ocp4-haproxy.cfg) + + +Steps: + +* Create ssh key pair using `ssh-keygen` and uploaded it to the ansible-pkistore-ci repository at [4] +* Through trial and error, we’ve produced a PXE boot configuration for one of the machines and managed to get it to boot and begin the bootstrap process via an ignition file see [5]. +* Next steps is to make a decision on networking configuration then configure DNS and create 2 haproxy proxies before creating the bootstrap and master OCP nodes. Jiras created: [CPE-678](https://projects.engineering.redhat.com/browse/CPE-678), [CPE-677](https://projects.engineering.redhat.com/browse/CPE-677) and [CPE-676](https://projects.engineering.redhat.com/browse/CPE-676) +* PR configuration for the HAProxy loadbalancers: [here](https://github.com/CentOS/ansible-role-haproxy/pull/2) +* Configuration for DNS/bind (encrypted): [here](https://scm.infra.centos.org/CentOS/ansible-filestore-ci/src/branch/master/bind/ci.centos.org) diff --git a/docs/operations/ci/installation/persistant_storage_nfs.md b/docs/operations/ci/installation/persistant_storage_nfs.md new file mode 100644 index 0000000..a5384d1 --- /dev/null +++ b/docs/operations/ci/installation/persistant_storage_nfs.md @@ -0,0 +1,14 @@ +# Persistent storage via NFS +Once the NFS storage is configured and available for use within the cluster, we can create PVs with the following adhoc playbook: [ansible-infra-playbooks/adhoc-openshift-pv.yml](https://github.com/CentOS/ansible-infra-playbooks/blob/master/adhoc-openshift-pv.yml) + +Sample usage: + +``` +ansible-playbook playbooks/adhoc-openshift-pv.yml -e "host=" -e "pv_size=10Gi" -e "cico_project_name=project-pv-name" +``` + + + +Resources: +* [1] Jira [https://projects.engineering.redhat.com/browse/CPE-701](https://projects.engineering.redhat.com/browse/CPE-701) +* [2] Configuring NFS [https://docs.openshift.com/container-platform/4.4/storage/persistent_storage/persistent-storage-nfs.html](https://docs.openshift.com/container-platform/4.4/storage/persistent_storage/persistent-storage-nfs.html) diff --git a/docs/operations/ci/installation/prerequisites.md b/docs/operations/ci/installation/prerequisites.md new file mode 100644 index 0000000..222bc19 --- /dev/null +++ b/docs/operations/ci/installation/prerequisites.md @@ -0,0 +1,84 @@ +## Prerequisites +The 
following are the prerequisites required to install OCP4 on bare metal
+
+### Access confirmation
+
+* Access to [https://access.redhat.com/](https://access.redhat.com/), if not, follow the steps
+    * [https://mojo.redhat.com/docs/DOC-99172](https://mojo.redhat.com/docs/DOC-99172)
+    * [https://docs.google.com/document/d/15DmYrfspKVwf4z8DzPK7sU-zmRERVWHBN3tghRkrkGU/edit](https://docs.google.com/document/d/15DmYrfspKVwf4z8DzPK7sU-zmRERVWHBN3tghRkrkGU/edit)
+* Git repo for the installer [https://github.com/openshift/installer](https://github.com/openshift/installer)
+* OpenShift playground: [https://try.openshift.com](https://try.openshift.com/)
+* Access.redhat.com account, to download packages/pull secrets [https://cloud.redhat.com/openshift/install](https://cloud.redhat.com/openshift/install)
+    * openshift-install client: [https://mirror.openshift.com/pub/openshift-v4/clients/ocp/latest/openshift-install-linux.tar.gz](https://mirror.openshift.com/pub/openshift-v4/clients/ocp/latest/openshift-install-linux.tar.gz)
+    * RHCOS download to create machines for your cluster to use during the installation [https://mirror.openshift.com/pub/openshift-v4/dependencies/rhcos/latest/latest/](https://mirror.openshift.com/pub/openshift-v4/dependencies/rhcos/latest/latest/)
+    * Openshift Command Line tools: [https://mirror.openshift.com/pub/openshift-v4/clients/ocp/latest/openshift-client-linux.tar.gz](https://mirror.openshift.com/pub/openshift-v4/clients/ocp/latest/openshift-client-linux.tar.gz)
+* Official documentation for installation:
+    * [https://docs.openshift.com/container-platform/4.3/installing/installing_bare_metal/installing-bare-metal.html](https://docs.openshift.com/container-platform/4.3/installing/installing_bare_metal/installing-bare-metal.html)
+    * [https://docs.openshift.com/container-platform/4.3/architecture/architecture-installation.html#architecture-installation](https://docs.openshift.com/container-platform/4.3/architecture/architecture-installation.html#architecture-installation)
+* Access RH employee subscription benefits:
+    * [https://mojo.redhat.com/docs/DOC-99172](https://mojo.redhat.com/docs/DOC-99172)
+    * [https://docs.google.com/document/d/15DmYrfspKVwf4z8DzPK7sU-zmRERVWHBN3tghRkrkGU/edit](https://docs.google.com/document/d/15DmYrfspKVwf4z8DzPK7sU-zmRERVWHBN3tghRkrkGU/edit)
+
+
+### Bootstrap node Identification
+As per [1], the minimum number of nodes needed for an Openshift 4 cluster is 6:
+
+* 1 bootstrap node
+* 3 master nodes
+* 2 worker nodes.
+
+As per [2], the minimum requirements for the bootstrap machine are:
+
+| Machine   | Operating System | vCPU | RAM   | Storage |
+| --------- | ---------------- | ---- | ----- | ------- |
+| Bootstrap | RHCOS            | 4    | 16 GB | 120 GB  |
+ + +* [1] Minimum number of nodes [https://docs.openshift.com/container-platform/4.3/installing/installing_bare_metal/installing-bare-metal.html#machine-requirements_installing-bare-metal](https://docs.openshift.com/container-platform/4.3/installing/installing_bare_metal/installing-bare-metal.html#machine-requirements_installing-bare-metal) +* [2] Minimum bootstrap/master/worker node requirements [https://docs.openshift.com/container-platform/4.3/installing/installing_bare_metal/installing-bare-metal.html#minimum-resource-requirements_installing-bare-metal](https://docs.openshift.com/container-platform/4.3/installing/installing_bare_metal/installing-bare-metal.html#minimum-resource-requirements_installing-bare-metal) +* [3] [https://ark.intel.com/content/www/us/en/ark/products/64591/intel-xeon-processor-e5-2640-15m-cache-2-50-ghz-7-20-gt-s-intel-qpi.html](https://ark.intel.com/content/www/us/en/ark/products/64591/intel-xeon-processor-e5-2640-15m-cache-2-50-ghz-7-20-gt-s-intel-qpi.html) + + +### Miscellaneous Prerequisites + +* Need internet access from the bootstrap/master/compute nodes so as to: +* Access the Red Hat OpenShift Cluster Manager page to download the installation program and perform subscription management and entitlement. If the cluster has internet access and you do not disable Telemetry, that service automatically entitles your cluster. If the Telemetry service cannot entitle your cluster, you must manually entitle it on the Cluster registration page +* Access quay.io to obtain the packages (images?) that are required to install your cluster. +* Obtain the packages that are required to perform cluster updates. +* **1.1.3.1**. Before you install OpenShift Container Platform, you must provision two layer-4 load balancers. +* Minimum of 6 nodes, 1 bootstrap node, 3 master, 2 compute. +* **1.1.3**. See this section to see the **network ports** which are required to be open and accessible from each machine +* Configure DHCP or set static IP addresses on each node. Be sure to configure it so the nodes always get the same IP address if configured via DHCP. diff --git a/docs/operations/ci/installation/verification.md b/docs/operations/ci/installation/verification.md new file mode 100644 index 0000000..3e65ce0 --- /dev/null +++ b/docs/operations/ci/installation/verification.md @@ -0,0 +1,7 @@ +# Adding some workloads for testing +Openshift 4, ships with a number of operators already configured and available via OperatorHub. We have tested with the Jenkinsci operator 0.4.0[1]. + +Resources: +* [1] jenkinsci/kubernetes/operator: [github](https://github.com/jenkinsci/kubernetes-operator) +* [2] Deploy the jenkinsci/kubernetes-operator on Kubernetes: [deploy yaml](https://raw.githubusercontent.com/jenkinsci/kubernetes-operator/master/deploy/all-in-one-v1alpha2.yaml) +* [3] Changes required to make this work correctly on Openshift: [gist](https://gist.github.com/davidkirwan/d3301c550c94dd1a95965dd8d7a91594) diff --git a/docs/operations/ci/kubevirt.md b/docs/operations/ci/kubevirt.md new file mode 100644 index 0000000..0a30aa3 --- /dev/null +++ b/docs/operations/ci/kubevirt.md @@ -0,0 +1,40 @@ +# kubevirt Instruction + +`Note: This doc is full of snippets of official doc in order to keep it to point. This is not to be considered a documentation/guide for others. Please refer official guide. This is mere a note for CentOS CI admins based on our workflow` + +## How to install Kubevirt in cluster + +* Open a browser window and log in to the OpenShift Container Platform web console. 
+* Navigate to the Operators → OperatorHub page.
+* Search for Container-native virtualization and then select it.
+* Read the information about the Operator and click Install.
+* On the Create Operator Subscription page:
+    * For Installed Namespace, ensure that the Operator recommended namespace option is selected. This installs the Operator in the mandatory openshift-cnv namespace, which is automatically created if it does not exist.
+
+    * Select 2.3 from the list of available Update Channel options.
+
+    * Click Subscribe to make the Operator available to the openshift-cnv namespace.
+
+On the Installed Operators screen, the Status displays Succeeded when container-native virtualization finishes installation.
+
+## Deploying container-native virtualization
+
+After subscribing to the Container-native virtualization catalog, create the CNV Operator Deployment custom resource to deploy container-native virtualization.
+
+* Navigate to the Operators → Installed Operators page.
+* Click Container-native virtualization.
+* Click the CNV Operator Deployment tab and click Create HyperConverged Cluster.
+* Click Create to launch container-native virtualization.
+* Navigate to the Workloads → Pods page and monitor the container-native virtualization Pods until they are all Running. After all the Pods display the Running state, you can access container-native virtualization.
+
+
+## Creating a VM
+
+* Create a VM template (or, to test whether Kubevirt works in your cluster, you can also use a test template from Kubevirt: `https://raw.githubusercontent.com/kubevirt/demo/master/manifests/vm.yaml`).
+* Once you have your template ready, run `oc create -f <your-template.yaml>`, or for test purposes `oc create -f https://raw.githubusercontent.com/kubevirt/demo/master/manifests/vm.yaml`.
+* Once it returns success, check that the VM was created with `oc get vm`.
+* Go to the web UI to start the VM; from there you can see everything there is to see about a VM.
+
+VMs are created in the powered-off state by default. To control them from the CLI, you need to install kubevirt-virtctl. Find [instructions here](https://docs.openshift.com/container-platform/4.4/cnv/cnv_install/cnv-installing-virtctl.html#cnv-enabling-cnv-repos_cnv-installing-virtctl).
+
+
diff --git a/docs/operations/ci/localstorage/binding_pvc.md b/docs/operations/ci/localstorage/binding_pvc.md
new file mode 100644
index 0000000..ad7aa1d
--- /dev/null
+++ b/docs/operations/ci/localstorage/binding_pvc.md
@@ -0,0 +1,28 @@
+# Binding a PVC to a local storage PV
+
+To bind a PVC to a local storage PV, do the following:
+
+Steps:
+
+* Create a PersistentVolumeClaim object like the example below: update `NAMESPACE` to the namespace the PVC will be created in, `NAME` to the name of the PVC, `SIZE` to a size which matches the local storage PV, and finally `LOCAL_STORAGE_PV_NAME` to the name of the local storage PV you wish to bind to.
+* Important: don't choose a local storage PV which lives on a master node, as master nodes are marked as unschedulable for user workloads (a quick check is sketched below).
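+
+Before filling in `LOCAL_STORAGE_PV_NAME`, it can help to confirm which local storage PVs are still Available and which node each one is tied to. A minimal sketch, using `local-pv-5ebe93a` from the listing in the installation SOP purely as an example name:
+
+```
+# List PVs backed by the local storage class and their current phase
+oc get pv | grep local-sc
+
+# Inspect a candidate PV; the nodeAffinity section names the host it is pinned to,
+# so you can skip any PV that lives on a master node
+oc get pv local-pv-5ebe93a -o yaml | grep -A 7 nodeAffinity
+```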
+
+
+```
+kind: PersistentVolumeClaim
+apiVersion: v1
+metadata:
+  name: NAME
+  namespace: NAMESPACE
+  finalizers:
+    - kubernetes.io/pvc-protection
+spec:
+  accessModes:
+    - ReadWriteMany
+  resources:
+    requests:
+      storage: SIZE
+  volumeName: LOCAL_STORAGE_PV_NAME
+  storageClassName: local-sc
+  volumeMode: Filesystem
+```
diff --git a/docs/operations/ci/localstorage/installation.md b/docs/operations/ci/localstorage/installation.md
new file mode 100644
index 0000000..2aa1fac
--- /dev/null
+++ b/docs/operations/ci/localstorage/installation.md
@@ -0,0 +1,63 @@
+# Adding Local Storage
+We plan to make use of the Local Storage Operator to format the /dev/sdb disks on each node, following the instructions at [4].
+
+Resources:
+
+* [1] 1.3.12.1. [https://access.redhat.com/documentation/en-us/openshift_container_platform/4.3/html/installing_on_bare_metal/installing-on-bare-metal](https://access.redhat.com/documentation/en-us/openshift_container_platform/4.3/html/installing_on_bare_metal/installing-on-bare-metal)
+* [2] Parameters to configure the image registry operator [https://access.redhat.com/documentation/en-us/openshift_container_platform/4.3/html-single/registry/index#registry-operator-configuration-resource-overview_configuring-registry-operator](https://access.redhat.com/documentation/en-us/openshift_container_platform/4.3/html-single/registry/index#registry-operator-configuration-resource-overview_configuring-registry-operator)
+* [3] [https://docs.openshift.com/container-platform/4.4/storage/understanding-persistent-storage.html](https://docs.openshift.com/container-platform/4.4/storage/understanding-persistent-storage.html)
+* [4] Configuring local storage [https://docs.openshift.com/container-platform/4.4/storage/persistent_storage/persistent-storage-local.html](https://docs.openshift.com/container-platform/4.4/storage/persistent_storage/persistent-storage-local.html)
+* [5] Configuring nfs storage [https://docs.openshift.com/container-platform/4.4/storage/persistent_storage/persistent-storage-nfs.html](https://docs.openshift.com/container-platform/4.4/storage/persistent_storage/persistent-storage-nfs.html)
+* [6] Persistent storage accessModes [https://kubernetes.io/docs/concepts/storage/persistent-volumes/#access-modes](https://kubernetes.io/docs/concepts/storage/persistent-volumes/#access-modes)
+
+Steps:
+
+* Installed the Local Storage Operator via the instructions at [4].
+* Created a LocalVolume object via the instructions at [4], see contents: [link](https://gist.github.com/davidkirwan/4cfbee653ecbab70484c9ce878e5eb90)
+* The documentation at [4] suggests that you can simply patch the daemonset config so that it also runs on the master nodes. This is not true: the Local Storage Operator will revert any changes to the objects which it is managing. This change must instead be made to the LocalVolume object created in the previous step.
+* A daemonset pod runs on each node that matches the selector in the LocalVolume object:
+
+
+```
+oc get ds
+NAME                            DESIRED   CURRENT   READY   UP-TO-DATE   AVAILABLE   NODE SELECTOR   AGE
+local-disks-local-diskmaker     7         7         7       7            7                           58m
+local-disks-local-provisioner   7         7         7       7            7                           58m
+```
+
+
+* I had to manually go onto each node and wipe the partition table on the /dev/sdb drive, then reboot the nodes one at a time (one way to do this is sketched after this list).
+* Upon rebooting, the daemonset pods format the disks and create a persistent volume.
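+
+A minimal sketch of the wipe step, assuming the spare disk really is `/dev/sdb` on every node and holds nothing you want to keep (wipefs is destructive, so double-check the device first):
+
+```
+# On each node, as root:
+lsblk /dev/sdb          # confirm this is the unused disk
+wipefs --all /dev/sdb   # remove any partition table / filesystem signatures
+systemctl reboot        # then reboot the node, one node at a time
+```
+
+Once the nodes are back up, the freshly provisioned PVs show up as Available: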
+
+
+```
+oc get pv
+NAME                CAPACITY   ACCESS MODES   RECLAIM POLICY   STATUS      CLAIM   STORAGECLASS   REASON   AGE
+local-pv-5ebe93a    223Gi      RWO            Delete           Available           local-sc                56m
+local-pv-67553558   223Gi      RWO            Delete           Available           local-sc                46m
+local-pv-6aa59705   223Gi      RWO            Delete           Available           local-sc                31s
+local-pv-cae6207    223Gi      RWO            Delete           Available           local-sc                9m6s
+local-pv-f5985e6f   223Gi      RWO            Delete           Available           local-sc                50m
+local-pv-f761542e   223Gi      RWO            Delete           Available           local-sc                3m52s
+local-pv-f9d2a890   223Gi      RWO            Delete           Available           local-sc                35m
+```
+
+* RWO is ReadWriteOnce, which means the volume can only be attached to a single pod. That's not what we want here; we want to be able to attach the volume to potentially many pods, see [6].
+* Rather than editing each PV one at a time and changing the access mode from ReadWriteOnce to ReadWriteMany, run the following, which handles the task automatically:
+
+```
+for i in $(oc get pv --selector storage.openshift.com/local-volume-owner-namespace=local-storage -o custom-columns="name:.metadata.name" | tail -n +2); do
+  oc patch pv $i --patch '{"spec":{"accessModes":["ReadWriteMany"]}}'
+done
+```
+
+
+```
+oc get pv
+NAME                CAPACITY   ACCESS MODES   RECLAIM POLICY   STATUS      CLAIM   STORAGECLASS   REASON   AGE
+local-pv-5ebe93a    223Gi      RWX            Delete           Available           local-sc                69m
+local-pv-67553558   223Gi      RWX            Delete           Available           local-sc                60m
+local-pv-6aa59705   223Gi      RWX            Delete           Available           local-sc                14m
+local-pv-cae6207    223Gi      RWX            Delete           Available           local-sc                22m
+local-pv-f5985e6f   223Gi      RWX            Delete           Available           local-sc                64m
+local-pv-f761542e   223Gi      RWX            Delete           Available           local-sc                17m
+local-pv-f9d2a890   223Gi      RWX            Delete           Available           local-sc                49m
+```
diff --git a/docs/operations/ci/migration/migrate_to_new_cluster.md b/docs/operations/ci/migration/migrate_to_new_cluster.md
new file mode 100644
index 0000000..0cb237a
--- /dev/null
+++ b/docs/operations/ci/migration/migrate_to_new_cluster.md
@@ -0,0 +1,75 @@
+# Action
+If you are currently using apps.ci.centos.org (OCP 3.6) or have a job running in ci.centos.org, please contact us so that we can create a namespace for you in the new OCP 4.4 cluster and you can start migrating your jobs.
+
+
+## If you own a namespace on apps.ci.centos.org
+There shouldn't be a lot of changes for you, as you should already have a familiar workflow with OpenShift if you have configured your jobs at least once. While opening a ticket or reaching out to us, please do mention whether you need the ability to check out duffy nodes (and if so, whether you already have a duffy API key).
+
+See the following, which explains how to open a ticket (point 3).
+
+
+## If your project is on ci.centos.org
+As you are using the central Jenkins deployment, which is not private to you, your project might see some changes. You will have your own Jenkins deployment to run all your jobs, instead of a shared Jenkins instance where you don't have admin access. With the extra privileges in Jenkins you can add plugins and credentials as you need, and a bunch of other abilities.
+This also means there is no direct maintenance required from the CentOS CI Infra admins: OpenShift takes care of things dying or crashing, and anything in the job configuration you can take care of yourself (we will help you get started and are always within reach if you notice any hiccup that is not auto-healed). You can keep the same workflow, or use this opportunity to change your freestyle jobs to pipelines, but that's entirely up to you.
+See the following, which explains how to open a ticket (point 3).
+
+
+### Open a project ticket
+
+Create a new ticket in [pagure.io/centos-infra/issues](https://pagure.io/centos-infra/issues) using the ci-migration
+template, or [click here](https://pagure.io/centos-infra/new_issue?template=ci-migration).
+
+We will have your accounts created within 48 hours (if no doubts/follow-up are needed).
+
+Note: this is only applicable to projects which already exist in other places. New projects will have to go through a different process/evaluation and we may need more time (this also applies to projects requesting extra privileges beyond the default admin-level access to the namespace).
+
+
+## Setting up your jobs in the cico-workspace Jenkins
+Configuring your jobs should be similar to your older configuration. You can either write your jobs in the trigger job's Groovy dialogue box, or source them from your repo (we recommend the latter for easier management of the Jenkinsfile). One thing to point out: in Groovy, the node parameter defines where the target job should be executed; the value must match either a label or a node name, otherwise the job will just stay in the queue.
+We have a custom label called `cico-workspace` that has python-cicoclient installed so that you can request duffy nodes. This workspace also has your duffy API key exported so that you can directly request nodes from duffy. Here is an example Jenkinsfile:
+
+```
+node('cico-workspace') {
+    stage('get cico node') {
+        // request a Duffy node; the command prints the hostname and the session id (comment field)
+        node = sh(script: "cico --debug node get -f value -c hostname -c comment", returnStdout: true).trim().tokenize(' ')
+        env.node_hostname = "${node[0]}.ci.centos.org"
+        env.node_ssid = "${node[1]}"
+    }
+
+    stage('tests-example') {
+        println("Put your tests here")
+        println("example running a test on ${node_hostname} ${node_ssid}")
+    }
+
+    stage('builds-example') {
+        println("Put your builds here")
+        println("example running a build on ${node_hostname} ${node_ssid}")
+    }
+
+    stage('return cico node') {
+        // hand the node back to Duffy when the job is finished
+        sh 'cico node done ${node_ssid} > commandResult'
+    }
+}
+```
+
+## Configuring the Kubernetes concurrency limit (number of cloud executors)
+As the *standard* (static) Jenkins executors have been replaced by cloud-based ones, the limit is now configured in a different place.
+The configuration now resides under:
+
+Manage Jenkins -> Manage Nodes and Clouds -> Configure Clouds -> Kubernetes -> Kubernetes Cloud details... -> Concurrency Limit
+
+By default, the limit is set to 100, i.e. 100 parallel jobs at once. In most scenarios this default is far too high and can cause quick
+Duffy pool depletion (if the spawned pods use the Duffy nodes). So, to be a good netizen, it is recommended to set this number to some
+sensible value (like 15 or 20, depending on your project), to take the other users of the Duffy nodes & Kubernetes cluster into consideration.
+
+## Configuring SMTP server for email notifications
+If you used job email notifications in the old (legacy) instance, you will probably find out that these don't work out of the box in the OCP
+instances. Fortunately, the solution is simple and consists of two steps:
+
+1. In **Manage Jenkins** -> **Configure System**, in the **Jenkins Location** section, fill in a valid **System Admin e-mail address**. Usually you want to use
+   something like *builder@* followed by the host from the **Jenkins URL** field above.
So, for example,
+   the whole email address can be *builder@my-fabulous-app.apps.ocp.ci.centos.org* (the *builder* part is optional, you can use whatever you want).
+
+2. Again in **Manage Jenkins** -> **Configure System**, in the **E-mail Notifications** section, fill in the current SMTP relay for the OCP instances, which
+   is **smtp.ci.centos.org**. Then, to check that everything works as it should, tick the **Test configuration by sending test e-mail** checkbox, fill in
+   the recipient, and click on **Test configuration**. If the email arrives, you should be all set.
diff --git a/docs/operations/ci/monitoring.md b/docs/operations/ci/monitoring.md
new file mode 100644
index 0000000..701fbb1
--- /dev/null
+++ b/docs/operations/ci/monitoring.md
@@ -0,0 +1,7 @@
+# Monitoring
+There is a monitoring stack deployed as part of OpenShift, see the "openshift-monitoring" namespace. View the routes to find the Prometheus, Grafana and Alertmanager services.
+We can use this monitoring stack, or alternatively deploy our own via [1]. Either way, we are responsible for maintaining it, since this is a bare-metal cluster.
+
+Resources:
+* [1] [https://docs.openshift.com/container-platform/4.4/monitoring/monitoring-your-own-services.html](https://docs.openshift.com/container-platform/4.4/monitoring/monitoring-your-own-services.html)
+* [2] Prometheus exporting to Zabbix [https://www.zabbix.com/integrations/prometheus](https://www.zabbix.com/integrations/prometheus)
diff --git a/docs/operations/ci/ocp.md b/docs/operations/ci/ocp.md
deleted file mode 100644
index e69de29..0000000
--- a/docs/operations/ci/ocp.md
+++ /dev/null
diff --git a/docs/operations/ci/onboarding_sysadmin.md b/docs/operations/ci/onboarding_sysadmin.md
new file mode 100644
index 0000000..ace4d5b
--- /dev/null
+++ b/docs/operations/ci/onboarding_sysadmin.md
@@ -0,0 +1,87 @@
+# CentOS Infra Onboarding
+
+### Mailing lists
+
+- https://lists.centos.org/mailman/listinfo/ci-users
+- https://lists.centos.org/mailman/listinfo/centos-devel
+- https://lists.centos.org/mailman/listinfo/centos-infra
+- ci-sysadmin@centos.org
+
+### IRC Channels
+- #centos-ci
+- #centos
+- #centos-meeting
+
+### Openshift
+Containers
+
+STG:
+- 3 master/control nodes
+- 5 worker/compute nodes
+
+Prod:
+- 3 master/control nodes
+- 9 worker/compute nodes
+
+
+### Open Nebula
+Application for managing VM instances.
+The GUI is only available within the network; you can do things like SSH port forwarding to access the console.
+
+- 1 frontend control node
+- 9 hypervisor nodes
+
+
+
+### Duffy
+Application for managing bare-metal nodes.
+
+- REST API
+- beanstalkd message bus
+- Ansible playbooks, IPMI, PXE
+- API client: python-cicoclient
+- A bare-metal node is reserved for 6 hours
+
+We maintain multiple versions of CentOS and several different architectures for bare-metal nodes in a pool, ready to be consumed,
+e.g. CentOS 6, 7, 8, 8-stream.
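+
+For a quick feel for the tenant workflow, this is roughly how a node is reserved and returned with python-cicoclient (a sketch; it assumes a valid Duffy API key is already configured for the client):
+
+```
+# Reserve a node; prints the hostname and the Duffy session id (comment field)
+cico node get -f value -c hostname -c comment
+
+# ...use the node over SSH for tests/builds, within the 6 hour window...
+
+# Return the node to the pool when finished, using the session id from above
+cico node done <session-id>
+```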
+
+Architecture node counts in pool:
+
+```
+x86-64            83
+i386               7
+ppc64 / ppc64le   45
+aarch64           20
+```
+
+- Duffy runs on: admin.ci.centos.org
+- https://github.com/centos/duffy
+
+
+### Legacy and (Legacy Legacy)
+- OKD 3.6 https://console.apps.ci.centos.org:8443/
+- Jenkins: https://ci.centos.org/
+
+
+### Resources
+
+- Working with CentOS Infra: https://docs.fedoraproject.org/en-US/cpe/day_to_day_centos/
+- Ticket Tracker: https://pagure.io/centos-infra (use the ci-migration template for migration requests)
+
+- Fedora Nest 2020 talk: https://docs.google.com/presentation/d/1Efwz73t4NdxuzmyV8kvi3SewCIzDpidpePXGLfuweso/edit?ts=5f2ea19e#slide=id.g8fead7ec8b_0_44
+
+- CentOS ACO (FAS): https://accounts.centos.org/
+- SSH Config Jump host: https://wiki.centos.org/TipsAndTricks/SshTips/JumpHost
+
+- CentOS Infra Playbooks: https://github.com/CentOS/ansible-infra-playbooks
+- Inventory: https://scm.infra.centos.org/CentOS/ansible-inventory-ci
+- pkistore: https://scm.infra.centos.org/CentOS/ansible-pkistore-ci
+- filestore: https://scm.infra.centos.org/CentOS/ansible-filestore-ci
+
+- WIKI: https://wiki.centos.org/
+
+- CentOS QA Kanban: https://apps.centos.org/kanboard/board/28
+- CentOS QA Jenkins: https://console.qa.centos.org/
+
+- Openshift 4 SOPs/Spikes: https://github.com/centosci/ocp4-docs
+- OCP4 Staging: https://console-openshift-console.apps.ocp.stg.ci.centos.org/
+- OCP4 Production: https://console-openshift-console.apps.ocp.ci.centos.org/
diff --git a/docs/operations/ci/outage_preparation.md b/docs/operations/ci/outage_preparation.md
new file mode 100644
index 0000000..38b0200
--- /dev/null
+++ b/docs/operations/ci/outage_preparation.md
@@ -0,0 +1,43 @@
+# CentOS CI Infra Outage Preparation
+During a scheduled outage where it is likely we will lose network access to the entire rack, or between racks, it is advisable to shut down the following services:
+
+- Duffy
+- CentOS CI Openshift prod/stg
+- Legacy CI Jenkins
+- Legacy OKD
+- keepalived on gateway02.ci.centos.org
+
+
+## Legacy OKD
+1. bstinson, as the only person on the team who has access to the legacy OKD cluster, must handle tasks related to this cluster.
+
+
+## OCP
+https://github.com/centosci/ocp4-docs/blob/master/sops/create_etcd_backup.md
+https://github.com/centosci/ocp4-docs/blob/master/sops/cordoning_nodes_and_draining_pods.md
+https://github.com/centosci/ocp4-docs/blob/master/sops/graceful_shutdown_ocp_cluster.md
+
+Admin nodes:
+- Prod: ocp-admin.ci.centos.org
+- Stg: n4-136.cloud.ci.centos.org
+
+2. Take an etcd backup to the admin node associated with prod/stg
+3. Cordon and drain all nodes
+4. Gracefully shut down the cluster
+
+
+## Duffy
+
+5. Switch off the duffy workers, which run as:
+   * `source duffy2-venv/bin/activate; FLASK_APP=duffy DUFFY_SETTINGS=/etc/duffy.conf python scripts/worker.py`
+6. Switch off the duffy server, which runs as:
+   * `FLASK_APP=duffy DUFFY_SETTINGS=/etc/duffy.conf flask run -h 0.0.0.0 -p 8080`
+7. ci.centos.org legacy Jenkins: in Manage Jenkins, prepare for shutdown
+   * ssh to the jenkins host - `systemctl restart jenkins`
+
+## keepalived on Gateway nodes
+
+8. Shut down keepalived on gateway02.ci.centos.org
+   * `sudo systemctl stop keepalived`
+
+
diff --git a/docs/operations/ci/replacing_certs.md b/docs/operations/ci/replacing_certs.md
new file mode 100644
index 0000000..a8cae5a
--- /dev/null
+++ b/docs/operations/ci/replacing_certs.md
@@ -0,0 +1,20 @@
+# Let's Encrypt Certs
+We need a valid cert for both the wildcard \*.apps.ocp.ci.centos.org and api.ocp.ci.centos.org.
+We have a role/task which covers replacing the TLS certs on ocp.ci/ocp.stg.ci: `https://github.com/CentOS/ansible-role-ocp-admin-node/blob/master/tasks/tls.yml`.
+
+A separate process is performed to request certs from Let's Encrypt and store the cacert/certs/key within the pkistore for ocp.ci/ocp.stg.ci.
+
+To deploy the certs to the cluster, we run the following playbook: `https://github.com/CentOS/ansible-infra-playbooks/blob/master/role-ocp-admin-node.yml`.
+
+eg:
+
+```
+ansible-playbook playbooks/role-ocp-admin-node.yml --tags "tls, certs"
+```
+
+
+Resources:
+* [1] Certman Operator (looks like this requires HIVE [5], so it is not going to work) [https://github.com/openshift/certman-operator](https://github.com/openshift/certman-operator)
+* [2] Changing the cert in OCP4 [https://docs.openshift.com/container-platform/4.4/authentication/certificates/replacing-default-ingress-certificate.html](https://docs.openshift.com/container-platform/4.4/authentication/certificates/replacing-default-ingress-certificate.html)
+* [3] RHMI SOP for manually replacing certs on a 3.11 cluster, many steps are similar: [https://github.com/RHCloudServices/integreatly-help/blob/master/sops/POC_cert_renewal.asciidoc](https://github.com/RHCloudServices/integreatly-help/blob/master/sops/POC_cert_renewal.asciidoc)
+* [4] Option Brian suggested: [https://github.com/tnozicka/openshift-acme](https://github.com/tnozicka/openshift-acme)
+* [5] HIVE [https://github.com/openshift/hive](https://github.com/openshift/hive)
diff --git a/docs/operations/ci/seamicro/RESTfulAPIsPhase2_f2.pdf b/docs/operations/ci/seamicro/RESTfulAPIsPhase2_f2.pdf
new file mode 100644
index 0000000..6abfb51
Binary files /dev/null and b/docs/operations/ci/seamicro/RESTfulAPIsPhase2_f2.pdf differ
diff --git a/docs/operations/ci/seamicro/SeaMicro_Rel_3.4_CLI_Guide_Oct-19-2013_Edition-1.pdf b/docs/operations/ci/seamicro/SeaMicro_Rel_3.4_CLI_Guide_Oct-19-2013_Edition-1.pdf
new file mode 100644
index 0000000..c071943
Binary files /dev/null and b/docs/operations/ci/seamicro/SeaMicro_Rel_3.4_CLI_Guide_Oct-19-2013_Edition-1.pdf differ