From 44f0c1403c9a3e7990ef067ee1d0413a0666e3d0 Mon Sep 17 00:00:00 2001 From: Nicholas Novak Date: Mon, 26 Oct 2020 11:51:35 -0700 Subject: [PATCH 01/16] Fixed some spelling and grammatical errors in the READMEs Signed-off-by: Nicholas Novak --- README.md | 2 +- chart/README.md | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index c5d73a3..1a87dc3 100644 --- a/README.md +++ b/README.md @@ -53,7 +53,7 @@ Longhorn can be installed on a Kubernetes cluster in several ways: The official Longhorn documentation is [here.](https://longhorn.io/docs) ## Community -Longhorn is an open source software, so contribution are greatly welcome. Please read [Code of Conduct](./CODE_OF_CONDUCT.md) and [Contributing Guideline](./CONTRIBUTING.md) before contributing. +Longhorn is open source software, so contributions are greatly welcome. Please read [Code of Conduct](./CODE_OF_CONDUCT.md) and [Contributing Guideline](./CONTRIBUTING.md) before contributing. Contributing code is not the only way of contributing. We value feedbacks very much and many of the Longhorn features are originated from users' feedback. If you have any feedbacks, feel free to [file an issue](https://github.com/longhorn/longhorn/issues/new?title=*Summarize%20your%20issue%20here*&body=*Describe%20your%20issue%20here*%0A%0A---%0AVersion%3A%20``) and talk to the developers at the [CNCF](https://slack.cncf.io/) [#longhorn](https://cloud-native.slack.com/messages/longhorn) slack channel. diff --git a/chart/README.md b/chart/README.md index b8519eb..afba135 100644 --- a/chart/README.md +++ b/chart/README.md @@ -1,6 +1,6 @@ # Longhorn Chart -> **Important**: Please install Longhorn chart in `longhorn-system` namespace only. +> **Important**: Please install the Longhorn chart in `longhorn-system` namespace only. > **Warning**: Longhorn doesn't support downgrading from a higher version to a lower version. @@ -21,7 +21,7 @@ Longhorn is 100% open source software. Project source code is spread across a nu 4. Make sure `open-iscsi` has been installed in all nodes of the Kubernetes cluster. For GKE, recommended Ubuntu as guest OS image since it contains `open-iscsi` already. ## Installation -1. Add Longhorn char repository. +1. Add Longhorn chart repository. ``` helm repo add longhorn https://charts.longhorn.io ``` @@ -32,11 +32,11 @@ helm repo update ``` 3. Install Longhorn chart. -- With Helm 2, the following command will create `longhorn-system` namespaceand install Longhorn chart together. +- With Helm 2, the following command will create the `longhorn-system` namespace and install the Longhorn chart together. ``` helm install longhorn/longhorn --name longhorn --namespace longhorn-system ``` -- With Helm 3, the following commands will create `longhorn-system` namespace first, then install Longhorn chart. +- With Helm 3, the following commands will create the `longhorn-system` namespace first, then install the Longhorn chart. 
``` kubectl create namespace longhorn-system From 138998163725fd1c1400359991f324335545be42 Mon Sep 17 00:00:00 2001 From: Nicholas Novak Date: Mon, 26 Oct 2020 11:54:54 -0700 Subject: [PATCH 02/16] Fixed a grammatical error that I missed in the README Signed-off-by: Nicholas Novak --- chart/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/chart/README.md b/chart/README.md index afba135..13e6312 100644 --- a/chart/README.md +++ b/chart/README.md @@ -1,6 +1,6 @@ # Longhorn Chart -> **Important**: Please install the Longhorn chart in `longhorn-system` namespace only. +> **Important**: Please install the Longhorn chart in the `longhorn-system` namespace only. > **Warning**: Longhorn doesn't support downgrading from a higher version to a lower version. From 6c34cef1ef286c72eb44ff8174ea90edb9e2d96f Mon Sep 17 00:00:00 2001 From: Bo Tao Date: Fri, 30 Oct 2020 22:13:28 -0700 Subject: [PATCH 03/16] Add iscsi installtion daemonset yaml file Add iscsi installation yaml file to provide a convenient way to install iscsi on every host. Longhorn #1741 Signed-off-by: Bo Tao --- deploy/iscsi/longhorn-iscsi-installation.yaml | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 deploy/iscsi/longhorn-iscsi-installation.yaml diff --git a/deploy/iscsi/longhorn-iscsi-installation.yaml b/deploy/iscsi/longhorn-iscsi-installation.yaml new file mode 100644 index 0000000..8e2a157 --- /dev/null +++ b/deploy/iscsi/longhorn-iscsi-installation.yaml @@ -0,0 +1,31 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: longhorn-iscsi-installation + labels: + app: longhorn-iscsi-installation + annotations: + command: &cmd OS=$(grep "ID_LIKE" /etc/os-release | cut -d '=' -f 2); if [[ $OS == *"debian"* ]]; then apt-get update -qy && apt-get install -qy open-iscsi && sudo systemctl enable iscsid && sudo systemctl start iscsid; else yum install iscsi-initiator-utils -y && sudo systemctl enable iscsid && sudo systemctl start iscsid; fi && if [ $? -eq 0 ]; then echo "iscsi install successfully"; else echo "iscsi install failed error code " $?; fi +spec: + selector: + matchLabels: + app: longhorn-iscsi-installation + template: + metadata: + labels: + app: longhorn-iscsi-installation + spec: + hostNetwork: true + containers: + - name: iscsi-installation + command: + - nsenter + - --mount=/proc/1/ns/mnt + - -- + - sh + - -c + - *cmd + image: alpine:3.7 + securityContext: + privileged: true + hostPID: true From 24e8c7c0ac14278c150bd570577f8aff95072fe3 Mon Sep 17 00:00:00 2001 From: Shuo Wu Date: Fri, 21 Aug 2020 19:29:33 +0800 Subject: [PATCH 04/16] enhancements: Add LEP 'Rebuild replica with existing data' Longhorn #1304 Signed-off-by: Shuo Wu --- ...0821-rebuild-replica-with-existing-data.md | 140 ++++++++++++++++++ 1 file changed, 140 insertions(+) create mode 100644 enhancements/20200821-rebuild-replica-with-existing-data.md diff --git a/enhancements/20200821-rebuild-replica-with-existing-data.md b/enhancements/20200821-rebuild-replica-with-existing-data.md new file mode 100644 index 0000000..df9d28f --- /dev/null +++ b/enhancements/20200821-rebuild-replica-with-existing-data.md @@ -0,0 +1,140 @@ +# Rebuild replica with existing data + +## Summary +Longhorn could reuse the existing data of failed replicas to speed up rebuild progress as well as save bandwidth. + +### Related Issues +https://github.com/longhorn/longhorn/issues/1304 + +## Motivation +### Goals +1. The (data of) failed replicas can be reused during the replica rebuild. +2. 
The rebuild won't be blocked when the data of the failed replicas is completely corrupted, or there is no existing replica.
3. With the existing data, some of the data transferring can be skipped, and the replica rebuild may speed up.

## Proposal
1. Add a new setting `ReplicaReplenishmentWaitInterval` to delay the replica rebuild.
    - If the failed replica is currently unavailable but may be reusable later (we call it a potentially reusable failed replica), Longhorn may need to delay the new replica replenishment so that there is a chance to reuse this kind of replica.
    - For eviction/data locality/new volume cases, a new replica should be recreated immediately, hence this setting won't be applied.
2. In order to reuse the existing data, Longhorn can directly reuse the failed replica objects for the rebuild.
3. Add a max retry count for the replica rebuild with failed replicas. Otherwise, the rebuild will get stuck reusing the failed replicas forever if their data is completely corrupted.
4. Add a backoff interval for retrying the failed replica reuse.

### User Stories
#### Rebuild replica for a large volume after network fluctuation/node reboot
Before the enhancement, there is no chance to reuse the failed replicas on the node, and the rebuild can take a long time with heavy bandwidth usage.

After the enhancement, the replica rebuild won't start until the new worker nodes with the old disks are up. Then the failed replicas will be reused during the rebuild, and the rebuild can be pretty fast.

### User Experience In Detail
Users don't need to do anything except set `ReplicaReplenishmentWaitInterval`.

### API Changes
No API change is required.

## Design
### Implementation Overview
#### longhorn-manager:
1. Add a setting `ReplicaReplenishmentWaitInterval`.
    - This will block the rebuilding when there is a failed replica that is temporarily unavailable in the volume.
    - Add a field `volume.Status.LastDegradedAt` so that we can determine if `ReplicaReplenishmentWaitInterval` has passed.
2. Add field `Replica.Spec.RebuildRetryCount` to indicate how many times Longhorn has tried to reuse this failed replica for the rebuild.
3. In Volume Controller && Replica Scheduler:
    1. Check if there is a reusable failed replica and if the replica reuse is not in the backoff window. If YES, directly try to reuse the failed replica.
    2. Otherwise, replenishing a new replica is required for one of the following cases:
        1. the volume is a new volume (volume.Status.Robustness is Empty)
        2. data locality is required (hardNodeAffinity is not Empty and volume.Status.Robustness is Healthy)
        3. replica eviction happens (volume.Status.Robustness is Healthy)
        4. there is no potentially reusable replica
        5. there is a potentially reusable replica but the replica replenishment wait interval has passed.
    3. Reuse the failed replica by cleaning up `ReplicaSpec.HealthyAt` and `ReplicaSpec.FailedAt`. And `Replica.Spec.RebuildRetryCount` will be increased by 1.
    4. Clean up the related record in `Replica.Spec.RebuildRetryCount` when the rebuilding replica becomes mode `RW`.
    5. Guarantee the reused failed replica will be stopped before re-launching it.

### Test Plan
#### Manual Test Plan
##### Rebuild replica for a large volume after network fluctuation/node reboot
1. Set `ReplicaReplenishmentWaitInterval`. Make sure it's longer than the node recovery interval.
2. Create and attach a large volume. Set a short `staleReplicaTimeout` for the volume, e.g., 1 minute.
3. Write a large amount of data then take a snapshot.
4. Repeat step 3 several times.
5. Reboot/Temporarily disconnect a node that contains a replica only.
6. According to the `ReplicaReplenishmentWaitInterval` and the node recovery interval:
   - Verify the failed replica is reused and there is no new replica for the rebuild after the node recovery.
   - Verify the replica rebuild only takes a relatively short time.

##### Replenish replicas when failed replicas cannot be reused
1. Create and attach a large volume.
2. Write data then take snapshots.
3. Hack into one replica directory and make the directory and files read-only.
4. Crash the related replica process and wait for the replica failure.
5. Wait and check if Longhorn tries to reuse the corrupted replica but always fails. Since there is a backoff mechanism, this will take a long time (8 ~ 10 min).
6. Check if Longhorn creates a new replica and succeeds in finishing the rebuild when the max retry count is reached.
7. Verify the data content. And check if the volume still works fine.

##### Replenish replicas when there is a potentially reusable failed replica and the replenishment wait interval has passed
1. Set `ReplicaReplenishmentWaitInterval` to 60s.
2. Create and attach a large volume.
3. Write data then take snapshots.
4. Shut down a node containing a replica only for 60s.
5. Wait and check if Longhorn tries to reuse the failed replica 2~3 times but always fails.
6. Check if Longhorn creates a new replica once the replenishment wait interval has passed.
7. Verify the data content. And check if the volume still works fine.

#### Reuse failed replicas for an old degraded volume after live upgrade
1. Deploy Longhorn v1.0.2.
2. Create and attach a volume. Write data to the volume.
3. Disable scheduling for 1 node.
4. Crash the replica on the node.
5. Upgrade Longhorn to the latest version. Verify the volume robustness is `Degraded`.
6. Enable scheduling for the node. Verify the failed replica of the existing degraded volume will be reused.
7. Verify the data content, and that the volume r/w still works fine.

#### Failed replica reuse backoff won't block replica replenishment
1. Deploy the latest Longhorn.
2. Create and attach a volume. Write data to the volume.
3. Update `Replica Replenishment Wait Interval` to 60s.
4. Crash a replica by removing the volume head file and creating a directory with the volume head file name, e.g., `rm volume-head-001.img && mkdir volume-head-001.img`. Then the replica reuse will continuously fail.
5. Verify:
    1. There is a backoff interval for the failed replica reuse.
    2. A new replica will be created after (around) 60s even though the failed replica reuse is in backoff.
    3. The data content is correct.
    4. The volume r/w still works fine.

#### Integration Test Plan
##### Reuse the failed replicas when the replica data is messed up
1. Set a long wait interval for setting `replica-replenishment-wait-interval`.
2. Disable the soft node anti-affinity setting.
3. Create and attach a volume. Then write data to the volume.
4. Disable the scheduling for a node.
5. Mess up the data of a random snapshot or the volume head for a replica. Then crash the replica on the node.
   --> Verify Longhorn won't create a new replica on the node for the volume.
6. Update setting `replica-replenishment-wait-interval` to a small value.
7. Verify Longhorn starts to create a new replica for the volume.
   Notice that the new replica scheduling will fail.
8. 
Update setting `replica-replenishment-wait-interval` to a large value.
9. Delete the newly created replica.
   --> Verify Longhorn won't create a new replica on the node for the volume.
10. Enable the scheduling for the node.
11. Verify the failed replica (in step 5) will be reused.
12. Verify the volume r/w still works fine.

#### Reuse the failed replicas with scheduling check
1. Set a long wait interval for setting `replica-replenishment-wait-interval`.
2. Disable the soft node anti-affinity setting.
3. Add tags for all nodes and disks.
4. Create and attach a volume with node and disk selectors. Then write data to the volume.
5. Disable the scheduling for the 2 nodes (node1 and node2).
6. Crash the replicas on node1 and node2.
   --> Verify Longhorn won't create new replicas on the nodes.
7. Remove the tags for node1 and the related disks.
8. Enable the scheduling for node1 and node2.
9. Verify only the failed replica on node2 is reused.
10. Add the tags back for node1 and the related disks.
11. Verify the failed replica on node1 is reused.
12. Verify the volume r/w still works fine.

### Upgrade strategy
Need to update `volume.Status.LastDegradedAt` for existing degraded volumes during live upgrade.

From 69c1a3eb3e370938215281a1c392a1703b31b6df Mon Sep 17 00:00:00 2001
From: Phan Le
Date: Wed, 9 Sep 2020 14:50:48 -0700
Subject: [PATCH 05/16] enhancement: Add LEP for Prometheus support

Longhorn#1180

Signed-off-by: Phan Le
---
 enhancements/20200909-prometheus-support.md | 462 ++++++++++++++++++++
 1 file changed, 462 insertions(+)
 create mode 100644 enhancements/20200909-prometheus-support.md

diff --git a/enhancements/20200909-prometheus-support.md b/enhancements/20200909-prometheus-support.md
new file mode 100644
index 0000000..2407c24
--- /dev/null
+++ b/enhancements/20200909-prometheus-support.md
@@ -0,0 +1,462 @@
# Prometheus Support

## Summary

We currently do not have a way for users to monitor and alert on events that happen in Longhorn, such as a volume becoming full, a backup failing, CPU usage, or memory consumption.
This enhancement exports Prometheus metrics so that users can use Prometheus or other monitoring systems to monitor Longhorn.

### Related Issues

https://github.com/longhorn/longhorn/issues/1180

## Motivation

### Goals

We are planning to expose 22 metrics in this release:
1. longhorn_volume_capacity_bytes
1. longhorn_volume_actual_size_bytes
1. longhorn_volume_state
1. longhorn_volume_robustness

1. longhorn_node_status
1. longhorn_node_count_total
1. longhorn_node_cpu_capacity_millicpu
1. longhorn_node_cpu_usage_millicpu
1. longhorn_node_memory_capacity_bytes
1. longhorn_node_memory_usage_bytes
1. longhorn_node_storage_capacity_bytes
1. longhorn_node_storage_usage_bytes
1. longhorn_node_storage_reservation_bytes

1. longhorn_disk_capacity_bytes
1. longhorn_disk_usage_bytes
1. longhorn_disk_reservation_bytes

1. longhorn_instance_manager_cpu_usage_millicpu
1. longhorn_instance_manager_cpu_requests_millicpu
1. longhorn_instance_manager_memory_usage_bytes
1. longhorn_instance_manager_memory_requests_bytes

1. longhorn_manager_cpu_usage_millicpu
1. longhorn_manager_memory_usage_bytes

See the [User Experience In Detail](#user-experience-in-detail) section for the definition of each metric.

### Non-goals

We are not planning to expose these 6 metrics in this release:
1. longhorn_backup_stats_number_failed_backups
1. longhorn_backup_stats_number_succeed_backups
1. 
longhorn_backup_stats_backup_status (status of this backup: 0=InProgress, 1=Done, 2=Failed)
1. longhorn_volume_io_ops
1. longhorn_volume_io_read_throughput
1. longhorn_volume_io_write_throughput

## Proposal

### User Stories

Longhorn already has a great UI with a lot of useful information.
However, Longhorn doesn't have any alert/notification mechanism yet.
Also, we don't have any dashboard or graphing support that gives users an overview picture of the storage system.
This enhancement will address both of the above issues.

#### Story 1
In many cases, a problem/issue can be quickly discovered if we have a monitoring dashboard.
For example, there have been many times when users asked us for support and the problem was that the Longhorn engines were killed for exceeding their CPU limit.
If there is a CPU monitoring dashboard for instance managers, those problems can be quickly detected.

#### Story 2
Users want to be notified about abnormal events such as the disk space limit approaching.
We can expose metrics that provide this information, and users can scrape the metrics and set up an alert system.

### User Experience In Detail

After this enhancement is merged, Longhorn exposes metrics at the endpoint `/metrics` in Prometheus' [text-based format](https://prometheus.io/docs/instrumenting/exposition_formats/).
Users can use Prometheus or other monitoring systems to collect those metrics by scraping the endpoint `/metrics` of the longhorn manager.
Then, users can display the collected data using tools such as Grafana.
Users can also set up alerts by using tools such as Prometheus Alertmanager.

Below are the descriptions of the metrics which Longhorn exposes and how users can use them:

1. longhorn_volume_capacity_bytes

   This metric reports the configured size in bytes for each volume which is managed by the current longhorn manager.

   This metric contains 2 labels (dimensions):
   * `node`: the node of the longhorn manager which is managing this volume
   * `volume`: the name of this volume

   Example of a sample of this metric could be:
   ```
   longhorn_volume_capacity_bytes{node="worker-2",volume="testvol"} 6.442450944e+09
   ```
   Users can use this metric to draw graphs and quickly see the big volumes in the storage system.

1. longhorn_volume_actual_size_bytes

   This metric reports the actual space used by each replica of the volume on the corresponding nodes.

   This metric contains 2 labels (dimensions):
   * `node`: the node of the longhorn manager which is managing this volume
   * `volume`: the name of this volume

   Example of a sample of this metric could be:
   ```
   longhorn_volume_actual_size_bytes{node="worker-2",volume="testvol"} 1.1917312e+08
   ```
   Users can use this metric to see the actual size Longhorn volumes occupy on disks.

1. longhorn_volume_state

   This metric reports the state of the volume. The states are: 1=creating, 2=attached, 3=detached, 4=attaching, 5=detaching, 6=deleting.

   This metric contains 2 labels (dimensions):
   * `node`: the node of the longhorn manager which is managing this volume
   * `volume`: the name of this volume

   Example of a sample of this metric could be:
   ```
   longhorn_volume_state{node="worker-3",volume="testvol1"} 2
   ```

1. longhorn_volume_robustness

   This metric reports the robustness of the volume. 
Possible values are: 0=unknown, 1=healthy, 2=degraded, 3=faulted + + This metric contains 2 labels (dimensions): + * `node`: the node of the longhorn manager which is managing this volume + * `volume`: the name of this volume + + Example of a sample of this metric could be: + ``` + longhorn_volume_robustness{node="worker-3",volume="testvol1"} 1 + ``` + +1. longhorn_node_status + + This metric reports the `ready`, `schedulable`, `mountPropagation` condition for the current node. + + This metric contains 3 labels (dimensions): + * `node` + * `condition`: the name of the condition (`ready`, `schedulable`, `mountPropagation`) + * `condition_reason` + + Example of a sample of this metric could be: + ``` + longhorn_node_status{condition="allowScheduling",condition_reason="",node="worker-3"} 1 + longhorn_node_status{condition="mountpropagation",condition_reason="",node="worker-3"} 1 + longhorn_node_status{condition="ready",condition_reason="",node="worker-3"} 1 + longhorn_node_status{condition="schedulable",condition_reason="",node="worker-3"} 1 + ``` + Users can use this metrics to setup alert about node status. + +1. longhorn_node_count_total + + This metric reports the total nodes in Longhorn system. + + Example of a sample of this metric could be: + ``` + longhorn_node_count_total 3 + ``` + Users can use this metric to detect the number of down nodes + +1. longhorn_node_cpu_capacity_millicpu + + Report the maximum allocatable cpu on this node + + Example of a sample of this metric could be: + ``` + longhorn_node_cpu_capacity_millicpu{node="worker-3"} 2000 + ``` + +1. longhorn_node_cpu_usage_millicpu + + Report the cpu usage on this node + + Example of a sample of this metric could be: + ``` + longhorn_node_cpu_usage_millicpu{node="worker-3"} 149 + ``` + +1. longhorn_node_memory_capacity_bytes + + Report the maximum allocatable memory on this node + + Example of a sample of this metric could be: + ``` + longhorn_node_memory_capacity_bytes{node="worker-3"} 4.031217664e+09 + ``` + +1. longhorn_node_memory_usage_bytes + + Report the memory usage on this node + + Example of a sample of this metric could be: + ``` + longhorn_node_memory_usage_bytes{node="worker-3"} 1.643794432e+09 + ``` + +1. longhorn_node_storage_capacity_bytes + + Report the storage capacity of this node + + Example of a sample of this metric could be: + ``` + longhorn_node_storage_capacity_bytes{node="worker-3"} 8.3987283968e+10 + ``` + +1. longhorn_node_storage_usage_bytes + + Report the used storage of this node + + Example of a sample of this metric could be: + ``` + longhorn_node_storage_usage_bytes{node="worker-3"} 9.060212736e+09 + ``` + +1. longhorn_node_storage_reservation_bytes + + Report the reserved storage for other applications and system on this node + + Example of a sample of this metric could be: + ``` + longhorn_node_storage_reservation_bytes{node="worker-3"} 2.519618519e+10 + ``` + +1. longhorn_disk_capacity_bytes + + Report the storage capacity of this disk. + + Example of a sample of this metric could be: + ``` + longhorn_disk_capacity_bytes{disk="default-disk-8b28ee3134628183",node="worker-3"} 8.3987283968e+10 + ``` + +1. longhorn_disk_usage_bytes + + Report the used storage of this disk + + Example of a sample of this metric could be: + ``` + longhorn_disk_usage_bytes{disk="default-disk-8b28ee3134628183",node="worker-3"} 9.060212736e+09 + ``` + +1. 
longhorn_disk_reservation_bytes

   Report the reserved storage for other applications and the system on this disk.

   Example of a sample of this metric could be:
   ```
   longhorn_disk_reservation_bytes{disk="default-disk-8b28ee3134628183",node="worker-3"} 2.519618519e+10
   ```

1. longhorn_instance_manager_cpu_requests_millicpu

   This metric reports the requested CPU resources in Kubernetes of the Longhorn instance managers on the current node.
   The unit of this metric is milliCPU. See more about the unit at https://kubernetes.io/docs/tasks/configure-pod-container/assign-cpu-resource/#cpu-units

   This metric contains 3 labels (dimensions):
   * `node`
   * `instance_manager`
   * `instance_manager_type`

   Example of a sample of this metric could be:
   ```
   longhorn_instance_manager_cpu_requests_millicpu{instance_manager="instance-manager-r-61ffe369",instance_manager_type="replica",node="worker-3"} 250
   ```

1. longhorn_instance_manager_cpu_usage_millicpu

   This metric reports the CPU usage of the Longhorn instance managers on the current node.
   The unit of this metric is milliCPU. See more about the unit at https://kubernetes.io/docs/tasks/configure-pod-container/assign-cpu-resource/#cpu-units

   This metric contains 3 labels (dimensions):
   * `node`
   * `instance_manager`
   * `instance_manager_type`

   Example of a sample of this metric could be:
   ```
   longhorn_instance_manager_cpu_usage_millicpu{instance_manager="instance-manager-r-61ffe369",instance_manager_type="replica",node="worker-3"} 0
   ```

1. longhorn_instance_manager_memory_requests_bytes

   This metric reports the requested memory in Kubernetes of the Longhorn instance managers on the current node.

   This metric contains 3 labels (dimensions):
   * `node`
   * `instance_manager`
   * `instance_manager_type`

   Example of a sample of this metric could be:
   ```
   longhorn_instance_manager_memory_requests_bytes{instance_manager="instance-manager-e-0a67975b",instance_manager_type="engine",node="worker-3"} 0
   ```

1. longhorn_instance_manager_memory_usage_bytes

   This metric reports the memory usage of the Longhorn instance managers on the current node.

   This metric contains 3 labels (dimensions):
   * `node`
   * `instance_manager`
   * `instance_manager_type`

   Example of a sample of this metric could be:
   ```
   longhorn_instance_manager_memory_usage_bytes{instance_manager="instance-manager-e-0a67975b",instance_manager_type="engine",node="worker-3"} 1.374208e+07
   ```

1. longhorn_manager_cpu_usage_millicpu

   This metric reports the CPU usage of the Longhorn manager on the current node.
   The unit of this metric is milliCPU. See more about the unit at https://kubernetes.io/docs/tasks/configure-pod-container/assign-cpu-resource/#cpu-units

   This metric contains 2 labels (dimensions):
   * `node`
   * `manager`

   Example of a sample of this metric could be:
   ```
   longhorn_manager_cpu_usage_millicpu{manager="longhorn-manager-x5cjj",node="phan-cluster-23-worker-3"} 15
   ```

1. longhorn_manager_memory_usage_bytes

   This metric reports the memory usage of the Longhorn manager on the current node.

   This metric contains 2 labels (dimensions):
   * `node`
   * `manager`

   Example of a sample of this metric could be:
   ```
   longhorn_manager_memory_usage_bytes{manager="longhorn-manager-x5cjj",node="worker-3"} 2.7979776e+07
   ```

### API changes
We add a new endpoint `/metrics` to expose all Longhorn Prometheus metrics. 
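
For illustration, below is a minimal sketch of how a monitoring system could scrape this endpoint. It is only an example and not part of the proposal or of the deployment manifests: the job name, the namespace, and the `app: longhorn-manager` pod label used for filtering are assumptions and may need to be adjusted to the actual installation.
```
# Example Prometheus scrape configuration (a sketch; adjust names, labels,
# and the namespace to match the actual installation).
scrape_configs:
  - job_name: longhorn-manager
    metrics_path: /metrics
    kubernetes_sd_configs:
      - role: pod
        namespaces:
          names:
            - longhorn-system
    relabel_configs:
      # Keep only the longhorn-manager pods; the label value is an assumption
      # based on the default deployment manifest.
      - source_labels: [__meta_kubernetes_pod_label_app]
        action: keep
        regex: longhorn-manager
```
With such a configuration in place, the collected series can be graphed in Grafana or fed into Alertmanager rules, as described in the user stories above.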

## Design

### Implementation Overview
We follow the [Prometheus best practice](https://prometheus.io/docs/instrumenting/writing_exporters/#deployment): each Longhorn manager reports information about the components it manages.
Prometheus can use its service discovery mechanism to find all longhorn-manager pods in the longhorn-backend service.

We create a new collector for each type (volumeCollector, backupCollector, nodeCollector, etc.) and have a common baseCollector.
This structure is similar to the controller package: we have volumeController, nodeController, etc., which have a common baseController.
The end result is a structure like a tree:
```
a custom registry <- many custom collectors share the same base collector <- many metrics in each custom collector
```
When a scrape request is made to the endpoint `/metrics`, a handler gathers data from the Longhorn custom registry, which in turn gathers data from the custom collectors, which in turn gather data from all of the metrics.

Below is how we collect data for each metric:

1. longhorn_volume_capacity_bytes

   We get the information about volumes' capacity by reading the volume CRD from the datastore.
   When a volume moves to a different node, the current longhorn manager stops reporting the volume.
   The volume will be reported by the new longhorn manager.

1. longhorn_volume_actual_size_bytes

   We get the information about volumes' actual size by reading the volume CRD from the datastore.
   When a volume moves to a different node, the current longhorn manager stops reporting the volume.
   The volume will be reported by the new longhorn manager.

1. longhorn_volume_state

   We get the information about volumes' state by reading the volume CRD from the datastore.

1. longhorn_volume_robustness

   We get the information about volumes' robustness by reading the volume CRD from the datastore.

1. longhorn_node_status

   We get the information about node status by reading the node CRD from the datastore.
   Nodes don't move like volumes, so we don't have to decide which longhorn manager reports which node.

1. longhorn_node_count_total

   We get the information about the total number of nodes by reading from the datastore.

1. longhorn_node_cpu_capacity_millicpu

   We get the information about the maximum allocatable CPU on this node by reading the Kubernetes node resource.

1. longhorn_node_cpu_usage_millicpu

   We get the information about the CPU usage on this node from the metrics client.

1. longhorn_node_memory_capacity_bytes

   We get the information about the maximum allocatable memory on this node by reading the Kubernetes node resource.

1. longhorn_node_memory_usage_bytes

   We get the information about the memory usage on this node from the metrics client.

1. longhorn_node_storage_capacity_bytes

   We get the information by reading the node CRD from the datastore.

1. longhorn_node_storage_usage_bytes

   We get the information by reading the node CRD from the datastore.

1. longhorn_node_storage_reservation_bytes

   We get the information by reading the node CRD from the datastore.

1. longhorn_disk_capacity_bytes

   We get the information by reading the node CRD from the datastore.

1. longhorn_disk_usage_bytes

   We get the information by reading the node CRD from the datastore.

1. longhorn_disk_reservation_bytes

   We get the information by reading the node CRD from the datastore.

1. longhorn_instance_manager_cpu_requests_millicpu

   We get the information by reading instance manager Pod objects from the datastore.

1. longhorn_instance_manager_cpu_usage_millicpu

   We get the information by using the Kubernetes metrics client. 

1. longhorn_instance_manager_memory_usage_bytes

   We get the information by using the Kubernetes metrics client.

1. longhorn_instance_manager_memory_requests_bytes

   We get the information by reading instance manager Pod objects from the datastore.

1. longhorn_manager_cpu_usage_millicpu

   We get the information by using the Kubernetes metrics client.

1. longhorn_manager_memory_usage_bytes

   We get the information by using the Kubernetes metrics client.

### Test plan

The manual test plan is detailed [here](https://github.com/longhorn/longhorn-tests/blob/master/docs/content/manual/release-specific/v1.1.0/prometheus_support.md).

### Upgrade strategy

This enhancement doesn't require any upgrade.

From 2e14a1c09e8f43a6fcb5d81feaf085adc7b3601f Mon Sep 17 00:00:00 2001
From: Shuo Wu
Date: Fri, 6 Nov 2020 20:14:04 +0800
Subject: [PATCH 06/16] enhancement: Add a new enhancement 'disk-reconnection'

Longhorn #1269

Signed-off-by: Shuo Wu
---
 enhancements/20201106-disk-reconnection.md | 104 +++++++++++++++++++++
 1 file changed, 104 insertions(+)
 create mode 100644 enhancements/20201106-disk-reconnection.md

diff --git a/enhancements/20201106-disk-reconnection.md b/enhancements/20201106-disk-reconnection.md
new file mode 100644
index 0000000..382d928
--- /dev/null
+++ b/enhancements/20201106-disk-reconnection.md
@@ -0,0 +1,104 @@
# Disk Reconnection

## Summary
When disks are reconnected/migrated to other Longhorn nodes, Longhorn should be able to detect the disk reconnection and update the node ID as well as the data path for the related replicas (including failed replicas).

### Related Issues
https://github.com/longhorn/longhorn/issues/1269

## Motivation
### Goals
The goal of this feature is to reuse the existing data of the failed replica when the corresponding disk is back.

### Non-Goals
How to reuse the existing data and handle the related rebuild features is already implemented in #1304 and is not the intention of this enhancement.

## Proposal
Identifying a disk that was previously used in Longhorn is not the key point. The essence of this feature is that Longhorn should know where to find and reuse the existing data of all related replicas when the disk is reconnected.
In other words, the fields indicating the replica data position should be updated when the disk is reconnected.

### User Stories
#### Migrate the existing disks to new nodes
Before the enhancement, there is no way to reuse the existing data when a disk is reconnected/migrated.

After the enhancement, this can be done by:
1. Detach the volumes using the disk.
2. Reconnect the disk to another node (both nodes keep running).
3. Reattach the related volumes.

#### Scale down the node but reuse the disks on the node
Before the enhancement, there is no chance to reuse the failed replicas on the node.

After the enhancement, Longhorn will update the path and node ID for all failed replicas using the disks, then Longhorn can reuse the failed replicas during rebuilding.

### User Experience In Detail
#### Migrate the existing disks to new nodes
1. Detach all related volumes using the disk before the disk migration.
2. Directly move the disk to the new node (physically or via the cloud vendor) and mount the disk.
3. Add the disk with the new mount point to the corresponding new Longhorn node on the Longhorn Node page.
4. Attach the volumes for the workloads.

#### Scale down the node but reuse the disks on the node
1. 
Directly shut down the node when there are replicas on the node. Then the replicas on the node will fail. +2. Move the disks on the down node to other running nodes (physically or in cloud vendor). +3. Add the disk with the new mount point to the corresponding new Longhorn node in Longhorn Node page. +4. Wait then verify the failed replicas using the disk will be reused, and the node ID & path info will be updated. + +### API Changes +There is no API change. + +## Design +### Implementation Overview +#### longhorn-manager: +1. When a disk is ready, Longhorn can list all related replicas via `replica.Spec.DiskID` then sync up node ID and path info for these replicas. + - If a disk is not ready, the scheduling info will be cleaned up. Longhorn won't be confused of updating replicas if multiple disconnected disks using the same Disk UUID. + - Need to add a disk related label for replicas. +2. Store DiskUUID rather than the disk name in `replica.Spec.DiskID` + - Need to update `DiskID` for existing replicas during upgrade. +3. Since the disk path of a replica may get changed but the data directory name is immutable. It's better to split `replica.Spec.DataPath` to `replica.Spec.DiskPath` and `replica.Spec.DataDirectoryName`. Then it's more convenient to sync up the disk path for replicas. + - Need to update the path fields for existing replicas during upgrade. + +### Test Plan +#### Integration Tests +##### Disk migration +1. Disable the node soft anti-affinity. +2. Create a new host disk. +3. Disable the default disk and add the extra disk with scheduling enabled for the current node. +4. Launch a Longhorn volume with 1 replica. + Then verify the only replica is scheduled to the new disk. +5. Write random data to the volume then verify the data. +6. Detach the volume. +7. Unmount then remount the disk to another path. (disk migration) +8. Create another Longhorn disk based on the migrated path. +9. Verify the Longhorn disk state. + - The Longhorn disk added before the migration should become "unschedulable". + - The Longhorn disk created after the migration should become "schedulable". +10. Verify the replica DiskID and the path is updated. +11. Attach the volume. Then verify the state and the data. + +#### Manual Tests +##### Some Longhorn worker nodes in AWS Auto Scaling group is in replacement +1. Set `ReplicaReplenishmentWaitInterval`. Make sure it's longer than the time needs for node replacement. +2. Launch a Kubernetes cluster with the nodes in AWS Auto Scaling group. Then Deploy Longhorn. +3. Deploy some workloads using Longhorn volumes. +4. Wait for/Trigger the ASG instance replacement. +5. Verify new replicas won't be created before reaching `ReplicaReplenishmentWaitInterval`. +6. Verify the failed replicas are reused after the node recovery. +7. Verify if workloads still work fine with the volumes after the recovery. + +##### Longhorn upgrade with node down and removal +1. Launch Longhorn v1.0.x +2. Create and attach a volume, then write data to the volume. +3. Directly remove a Kubernetes node, and shut down a node. +4. Wait for the related replicas failure. Then record `replica.Spec.DiskID` for the failed replicas. +5. Upgrade to Longhorn master +6. Verify the Longhorn node related to the removed node is gone. +7. Verify + 1. `replica.Spec.DiskID` on the down node is updated and the field of the replica on the gone node is unchanged. + 2. `replica.Spec.DataPath` for all replicas becomes empty. +8. Remove all unscheduled replicas. +9. Power on the down node. 
Wait for the failed replica on the down node being reused. +10. Wait for a new replica being replenished and available. + +### Upgrade strategy +Need to update disk ID and data path for existing replicas. From 36ad2daf0b8cee1ce64782ca00870f418b764d4e Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Thu, 12 Nov 2020 08:46:07 -0800 Subject: [PATCH 07/16] Update README.md Remove the Astronomer badge since it was archived by the author. Signed-off-by: Sheng Yang --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 1a87dc3..096b713 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Longhorn [![Astronomer](https://img.shields.io/endpoint.svg?url=https%3A%2F%2Fastronomer.ullaakut.eu%2Fshields%3Fowner%3Dlonghorn%26name%3Dlonghorn)](https://github.com/Ullaakut/astronomer) +# Longhorn ### Build Status * Engine: [![Build Status](https://drone-publish.longhorn.io/api/badges/longhorn/longhorn-engine/status.svg)](https://drone-publish.longhorn.io/longhorn/longhorn-engine) [![Go Report Card](https://goreportcard.com/badge/github.com/longhorn/longhorn-engine)](https://goreportcard.com/report/github.com/longhorn/longhorn-engine) From 351322ff05716bf66b1d46385a7ddb117e0ff9e1 Mon Sep 17 00:00:00 2001 From: William Jimenez Date: Fri, 13 Nov 2020 18:55:22 -0800 Subject: [PATCH 08/16] Update README.md highlight community events Signed-off-by: William Jimenez --- README.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 096b713..eccc930 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ * Test: [![Build Status](http://drone-publish.longhorn.io/api/badges/longhorn/longhorn-tests/status.svg)](http://drone-publish.longhorn.io/longhorn/longhorn-tests) ### Overview -Longhorn is a distributed block storage system for Kubernetes. +Longhorn is a distributed block storage system for Kubernetes. Longhorn is cloud native storage because it is built using Kubernetes and container primatives. Longhorn is lightweight, reliable, and powerful. You can install Longhorn on an existing Kubernetes cluster with one `kubectl apply` command or using Helm charts. Once Longhorn is installed, it adds persistent volume support to the Kubernetes cluster. @@ -23,6 +23,13 @@ Longhorn implements distributed block storage using containers and microservices You can read more technical details of Longhorn [here](https://longhorn.io/). + +## Get Involved +**Community Meeting and Office Hours**!: Hosted by the core maintainers of Longhorn: 2nd Friday of the every month at 09:00 Pacific Time (PT)/12:00 Eastern Time (ET) on Zoom: http://bit.ly/longhorn-community-meeting. Gcal event: http://bit.ly/longhorn-events +**Longhorn Mailing List**!: Stay up to date on the latest news and events: https://lists.cncf.io/g/cncf-longhorn + +You can read more about the community and its events here: https://github.com/longhorn/community + ## Current status The latest release of Longhorn is **v1.0.2**. From 4d52211839d40efabc6d89ed021132ac7eba4fa8 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Thu, 19 Nov 2020 15:48:45 -0800 Subject: [PATCH 09/16] Update bug_report.md Add more questions regarding the node config and underlying infrastructure. 
Signed-off-by: Sheng Yang --- .github/ISSUE_TEMPLATE/bug_report.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index b39b446..b653681 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -27,7 +27,13 @@ If applicable, add the Longhorn managers' log when the issue happens. **Environment:** - Longhorn version: - Kubernetes version: - - Node OS type and version: + - Node config + - OS type and version + - CPU per node: + - Memory per node: + - Disk type + - Network bandwidth and latency between the nodes: + - Underlying Infrastructure (e.g. on AWS/GCE, EKS/GKE, VMWare/KVM, Baremetal): **Additional context** Add any other context about the problem here. From 90350b1903d5c5f5e199a9fcdcba2fd7761b11d0 Mon Sep 17 00:00:00 2001 From: Phan Le Date: Thu, 3 Dec 2020 16:02:41 -0800 Subject: [PATCH 10/16] Fix crash loop error in longhorn-iscsi-installation We install iscsi in the init container then sleep in the main container. This avoids crash loop after finishing installing iscsi Longhorn #1741 Signed-off-by: Phan Le --- deploy/iscsi/longhorn-iscsi-installation.yaml | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/deploy/iscsi/longhorn-iscsi-installation.yaml b/deploy/iscsi/longhorn-iscsi-installation.yaml index 8e2a157..02d201c 100644 --- a/deploy/iscsi/longhorn-iscsi-installation.yaml +++ b/deploy/iscsi/longhorn-iscsi-installation.yaml @@ -16,7 +16,8 @@ spec: app: longhorn-iscsi-installation spec: hostNetwork: true - containers: + hostPID: true + initContainers: - name: iscsi-installation command: - nsenter @@ -28,4 +29,8 @@ spec: image: alpine:3.7 securityContext: privileged: true - hostPID: true + containers: + - name: sleep + image: k8s.gcr.io/pause:3.1 + updateStrategy: + type: RollingUpdate From 3898d17d625676793ae728f2dcfc1c9c7b6245dc Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Sun, 6 Dec 2020 09:22:16 -0800 Subject: [PATCH 11/16] Update README.md Update the build badge for Share Manager. 
Signed-off-by: Sheng Yang --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index eccc930..1b816d6 100644 --- a/README.md +++ b/README.md @@ -2,8 +2,9 @@ ### Build Status * Engine: [![Build Status](https://drone-publish.longhorn.io/api/badges/longhorn/longhorn-engine/status.svg)](https://drone-publish.longhorn.io/longhorn/longhorn-engine) [![Go Report Card](https://goreportcard.com/badge/github.com/longhorn/longhorn-engine)](https://goreportcard.com/report/github.com/longhorn/longhorn-engine) -* Instance Manager: [![Build Status](http://drone-publish.longhorn.io/api/badges/longhorn/longhorn-instance-manager/status.svg)](http://drone-publish.longhorn.io/longhorn/longhorn-instance-manager)[![Go Report Card](https://goreportcard.com/badge/github.com/longhorn/longhorn-instance-manager)](https://goreportcard.com/report/github.com/longhorn/longhorn-instance-manager) * Manager: [![Build Status](https://drone-publish.longhorn.io/api/badges/longhorn/longhorn-manager/status.svg)](https://drone-publish.longhorn.io/longhorn/longhorn-manager)[![Go Report Card](https://goreportcard.com/badge/github.com/longhorn/longhorn-manager)](https://goreportcard.com/report/github.com/longhorn/longhorn-manager) +* Instance Manager: [![Build Status](http://drone-publish.longhorn.io/api/badges/longhorn/longhorn-instance-manager/status.svg)](http://drone-publish.longhorn.io/longhorn/longhorn-instance-manager)[![Go Report Card](https://goreportcard.com/badge/github.com/longhorn/longhorn-instance-manager)](https://goreportcard.com/report/github.com/longhorn/longhorn-instance-manager) +* Share Manager: [![Build Status](http://drone-publish.longhorn.io/api/badges/longhorn/longhorn-share-manager/status.svg)](http://drone-publish.longhorn.io/longhorn/longhorn-share-manager)[![Go Report Card](https://goreportcard.com/badge/github.com/longhorn/longhorn-share-manager)](https://goreportcard.com/report/github.com/longhorn/longhorn-share-manager) * UI: [![Build Status](https://drone-publish.longhorn.io/api/badges/longhorn/longhorn-ui/status.svg)](https://drone-publish.longhorn.io/longhorn/longhorn-ui) * Test: [![Build Status](http://drone-publish.longhorn.io/api/badges/longhorn/longhorn-tests/status.svg)](http://drone-publish.longhorn.io/longhorn/longhorn-tests) From 12d62cfcf80dbe1935907b2b1a160fdd0adf320c Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Mon, 7 Dec 2020 18:35:55 -0800 Subject: [PATCH 12/16] Update question.md Signed-off-by: Sheng Yang --- .github/ISSUE_TEMPLATE/question.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/.github/ISSUE_TEMPLATE/question.md b/.github/ISSUE_TEMPLATE/question.md index 8488260..783fb15 100644 --- a/.github/ISSUE_TEMPLATE/question.md +++ b/.github/ISSUE_TEMPLATE/question.md @@ -6,5 +6,18 @@ labels: question assignees: '' --- +**Question** +**Environment:** + - Longhorn version: + - Kubernetes version: + - Node config + - OS type and version + - CPU per node: + - Memory per node: + - Disk type + - Network bandwidth and latency between the nodes: + - Underlying Infrastructure (e.g. on AWS/GCE, EKS/GKE, VMWare/KVM, Baremetal): +**Additional context** +Add any other context about the problem here. From cc66f57557c1943c2f222271f38a7aa2814f23f3 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Wed, 9 Dec 2020 15:40:50 -0800 Subject: [PATCH 13/16] Update bug_report.md Add information regarding the support bundle. 
Signed-off-by: Sheng Yang --- .github/ISSUE_TEMPLATE/bug_report.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index b653681..bcc0177 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -23,15 +23,16 @@ A clear and concise description of what you expected to happen. **Log** If applicable, add the Longhorn managers' log when the issue happens. +You can also attach a *Support Bundle* here. You can generate a Support Bundle using the link at the footer of the Longhorn UI. **Environment:** - Longhorn version: - - Kubernetes version: + - Kubernetes distro (e.g. RKE/K3s/EKS/OpenShift) and version: - Node config - - OS type and version + - OS type and version: - CPU per node: - Memory per node: - - Disk type + - Disk type(e.g. SSD/NVMe): - Network bandwidth and latency between the nodes: - Underlying Infrastructure (e.g. on AWS/GCE, EKS/GKE, VMWare/KVM, Baremetal): From 4891c1ef3b8fc58e428b7f9d0a3ce809f7043bd1 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Thu, 17 Dec 2020 18:08:23 -0800 Subject: [PATCH 14/16] Sync with manager commit 278ff44085b967923d6f07dfb43a95a7b2974470 Author: Sheng Yang Date: Thu Dec 17 15:04:10 2020 -0800 Longhorn v1.1.0 release Signed-off-by: Sheng Yang Signed-off-by: Sheng Yang --- deploy/longhorn-images.txt | 6 +++--- deploy/longhorn.yaml | 14 +++++++------- deploy/release-images.txt | 6 +++--- uninstall/uninstall.yaml | 2 +- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/deploy/longhorn-images.txt b/deploy/longhorn-images.txt index 3af2641..d1d2174 100644 --- a/deploy/longhorn-images.txt +++ b/deploy/longhorn-images.txt @@ -1,8 +1,8 @@ -longhornio/longhorn-engine:v1.1.0-rc3 +longhornio/longhorn-engine:master longhornio/longhorn-instance-manager:v1_20201216 longhornio/longhorn-share-manager:v1_20201204 -longhornio/longhorn-manager:v1.1.0-rc3 -longhornio/longhorn-ui:v1.1.0-rc3 +longhornio/longhorn-manager:master +longhornio/longhorn-ui:master longhornio/csi-attacher:v2.2.1-lh1 longhornio/csi-provisioner:v1.6.0-lh1 longhornio/csi-resizer:v0.5.1-lh1 diff --git a/deploy/longhorn.yaml b/deploy/longhorn.yaml index 4d82be6..b22152a 100644 --- a/deploy/longhorn.yaml +++ b/deploy/longhorn.yaml @@ -593,7 +593,7 @@ spec: spec: containers: - name: longhorn-manager - image: longhornio/longhorn-manager:v1.1.0-rc3 + image: longhornio/longhorn-manager:master imagePullPolicy: IfNotPresent securityContext: privileged: true @@ -602,13 +602,13 @@ spec: - -d - daemon - --engine-image - - longhornio/longhorn-engine:v1.1.0-rc3 + - longhornio/longhorn-engine:master - --instance-manager-image - longhornio/longhorn-instance-manager:v1_20201216 - --share-manager-image - longhornio/longhorn-share-manager:v1_20201204 - --manager-image - - longhornio/longhorn-manager:v1.1.0-rc3 + - longhornio/longhorn-manager:master - --service-account - longhorn-service-account ports: @@ -699,7 +699,7 @@ spec: spec: containers: - name: longhorn-ui - image: longhornio/longhorn-ui:v1.1.0-rc3 + image: longhornio/longhorn-ui:master imagePullPolicy: IfNotPresent securityContext: runAsUser: 0 @@ -746,18 +746,18 @@ spec: spec: initContainers: - name: wait-longhorn-manager - image: longhornio/longhorn-manager:v1.1.0-rc3 + image: longhornio/longhorn-manager:master command: ['sh', '-c', 'while [ $(curl -m 1 -s -o /dev/null -w "%{http_code}" http://longhorn-backend:9500/v1) != "200" ]; do echo waiting; sleep 2; done'] containers: - name: 
longhorn-driver-deployer - image: longhornio/longhorn-manager:v1.1.0-rc3 + image: longhornio/longhorn-manager:master imagePullPolicy: IfNotPresent command: - longhorn-manager - -d - deploy-driver - --manager-image - - longhornio/longhorn-manager:v1.1.0-rc3 + - longhornio/longhorn-manager:master - --manager-url - http://longhorn-backend:9500/v1 env: diff --git a/deploy/release-images.txt b/deploy/release-images.txt index 3af2641..d1d2174 100644 --- a/deploy/release-images.txt +++ b/deploy/release-images.txt @@ -1,8 +1,8 @@ -longhornio/longhorn-engine:v1.1.0-rc3 +longhornio/longhorn-engine:master longhornio/longhorn-instance-manager:v1_20201216 longhornio/longhorn-share-manager:v1_20201204 -longhornio/longhorn-manager:v1.1.0-rc3 -longhornio/longhorn-ui:v1.1.0-rc3 +longhornio/longhorn-manager:master +longhornio/longhorn-ui:master longhornio/csi-attacher:v2.2.1-lh1 longhornio/csi-provisioner:v1.6.0-lh1 longhornio/csi-resizer:v0.5.1-lh1 diff --git a/uninstall/uninstall.yaml b/uninstall/uninstall.yaml index 116b4f6..0ac3806 100644 --- a/uninstall/uninstall.yaml +++ b/uninstall/uninstall.yaml @@ -67,7 +67,7 @@ spec: spec: containers: - name: longhorn-uninstall - image: longhornio/longhorn-manager:v1.1.0-rc3 + image: longhornio/longhorn-manager:master imagePullPolicy: Always command: - longhorn-manager From 1489feee7b0d60582c85b3eda494358ae897ee79 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Thu, 17 Dec 2020 18:09:01 -0800 Subject: [PATCH 15/16] Update version to v1.1.0 Signed-off-by: Sheng Yang --- chart/Chart.yaml | 4 ++-- chart/questions.yml | 6 +++--- chart/values.yaml | 6 +++--- deploy/longhorn-images.txt | 6 +++--- deploy/longhorn.yaml | 14 +++++++------- deploy/release-images.txt | 6 +++--- uninstall/uninstall.yaml | 2 +- 7 files changed, 22 insertions(+), 22 deletions(-) diff --git a/chart/Chart.yaml b/chart/Chart.yaml index 7b26b68..8e15810 100644 --- a/chart/Chart.yaml +++ b/chart/Chart.yaml @@ -1,7 +1,7 @@ apiVersion: v1 name: longhorn -version: 1.1.0-rc3 -appVersion: v1.1.0-rc3 +version: 1.1.0 +appVersion: v1.1.0 kubeVersion: ">=v1.16.0-r0" description: Longhorn is a distributed block storage system for Kubernetes. 
keywords: diff --git a/chart/questions.yml b/chart/questions.yml index 010cd63..ef33761 100644 --- a/chart/questions.yml +++ b/chart/questions.yml @@ -17,7 +17,7 @@ questions: label: Longhorn Manager Image Repository group: "Longhorn Images Settings" - variable: image.longhorn.manager.tag - default: v1.1.0-rc3 + default: v1.1.0 description: "Specify Longhorn Manager Image Tag" type: string label: Longhorn Manager Image Tag @@ -29,7 +29,7 @@ questions: label: Longhorn Engine Image Repository group: "Longhorn Images Settings" - variable: image.longhorn.engine.tag - default: v1.1.0-rc3 + default: v1.1.0 description: "Specify Longhorn Engine Image Tag" type: string label: Longhorn Engine Image Tag @@ -41,7 +41,7 @@ questions: label: Longhorn UI Image Repository group: "Longhorn Images Settings" - variable: image.longhorn.ui.tag - default: v1.1.0-rc3 + default: v1.1.0 description: "Specify Longhorn UI Image Tag" type: string label: Longhorn UI Image Tag diff --git a/chart/values.yaml b/chart/values.yaml index a5d75e6..d3345d4 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -9,13 +9,13 @@ image: longhorn: engine: repository: longhornio/longhorn-engine - tag: v1.1.0-rc3 + tag: v1.1.0 manager: repository: longhornio/longhorn-manager - tag: v1.1.0-rc3 + tag: v1.1.0 ui: repository: longhornio/longhorn-ui - tag: v1.1.0-rc3 + tag: v1.1.0 instanceManager: repository: longhornio/longhorn-instance-manager tag: v1_20201216 diff --git a/deploy/longhorn-images.txt b/deploy/longhorn-images.txt index d1d2174..25bb23f 100644 --- a/deploy/longhorn-images.txt +++ b/deploy/longhorn-images.txt @@ -1,8 +1,8 @@ -longhornio/longhorn-engine:master +longhornio/longhorn-engine:v1.1.0 longhornio/longhorn-instance-manager:v1_20201216 longhornio/longhorn-share-manager:v1_20201204 -longhornio/longhorn-manager:master -longhornio/longhorn-ui:master +longhornio/longhorn-manager:v1.1.0 +longhornio/longhorn-ui:v1.1.0 longhornio/csi-attacher:v2.2.1-lh1 longhornio/csi-provisioner:v1.6.0-lh1 longhornio/csi-resizer:v0.5.1-lh1 diff --git a/deploy/longhorn.yaml b/deploy/longhorn.yaml index b22152a..2a112c1 100644 --- a/deploy/longhorn.yaml +++ b/deploy/longhorn.yaml @@ -593,7 +593,7 @@ spec: spec: containers: - name: longhorn-manager - image: longhornio/longhorn-manager:master + image: longhornio/longhorn-manager:v1.1.0 imagePullPolicy: IfNotPresent securityContext: privileged: true @@ -602,13 +602,13 @@ spec: - -d - daemon - --engine-image - - longhornio/longhorn-engine:master + - longhornio/longhorn-engine:v1.1.0 - --instance-manager-image - longhornio/longhorn-instance-manager:v1_20201216 - --share-manager-image - longhornio/longhorn-share-manager:v1_20201204 - --manager-image - - longhornio/longhorn-manager:master + - longhornio/longhorn-manager:v1.1.0 - --service-account - longhorn-service-account ports: @@ -699,7 +699,7 @@ spec: spec: containers: - name: longhorn-ui - image: longhornio/longhorn-ui:master + image: longhornio/longhorn-ui:v1.1.0 imagePullPolicy: IfNotPresent securityContext: runAsUser: 0 @@ -746,18 +746,18 @@ spec: spec: initContainers: - name: wait-longhorn-manager - image: longhornio/longhorn-manager:master + image: longhornio/longhorn-manager:v1.1.0 command: ['sh', '-c', 'while [ $(curl -m 1 -s -o /dev/null -w "%{http_code}" http://longhorn-backend:9500/v1) != "200" ]; do echo waiting; sleep 2; done'] containers: - name: longhorn-driver-deployer - image: longhornio/longhorn-manager:master + image: longhornio/longhorn-manager:v1.1.0 imagePullPolicy: IfNotPresent command: - longhorn-manager - -d - 
deploy-driver
         - --manager-image
-        - longhornio/longhorn-manager:master
+        - longhornio/longhorn-manager:v1.1.0
         - --manager-url
         - http://longhorn-backend:9500/v1
         env:
diff --git a/deploy/release-images.txt b/deploy/release-images.txt
index d1d2174..25bb23f 100644
--- a/deploy/release-images.txt
+++ b/deploy/release-images.txt
@@ -1,8 +1,8 @@
-longhornio/longhorn-engine:master
+longhornio/longhorn-engine:v1.1.0
 longhornio/longhorn-instance-manager:v1_20201216
 longhornio/longhorn-share-manager:v1_20201204
-longhornio/longhorn-manager:master
-longhornio/longhorn-ui:master
+longhornio/longhorn-manager:v1.1.0
+longhornio/longhorn-ui:v1.1.0
 longhornio/csi-attacher:v2.2.1-lh1
 longhornio/csi-provisioner:v1.6.0-lh1
 longhornio/csi-resizer:v0.5.1-lh1
diff --git a/uninstall/uninstall.yaml b/uninstall/uninstall.yaml
index 0ac3806..b9d1f20 100644
--- a/uninstall/uninstall.yaml
+++ b/uninstall/uninstall.yaml
@@ -67,7 +67,7 @@ spec:
     spec:
       containers:
       - name: longhorn-uninstall
-        image: longhornio/longhorn-manager:master
+        image: longhornio/longhorn-manager:v1.1.0
         imagePullPolicy: Always
         command:
         - longhorn-manager

From 55cfc3482aa8e51e5c9c3860467b2b3d4f98a06b Mon Sep 17 00:00:00 2001
From: Sheng Yang
Date: Thu, 17 Dec 2020 18:21:15 -0800
Subject: [PATCH 16/16] Remove `examples/rwx`

Remove it to prevent misleading users. It should be removed from Longhorn manager as well.

Signed-off-by: Sheng Yang
---
 examples/rwx/01-security.yaml                 |  85 ---------
 examples/rwx/02-longhorn-nfs-provisioner.yaml | 178 ------------------
 examples/rwx/03-rwx-test.yaml                 |  59 ------
 3 files changed, 322 deletions(-)
 delete mode 100644 examples/rwx/01-security.yaml
 delete mode 100644 examples/rwx/02-longhorn-nfs-provisioner.yaml
 delete mode 100644 examples/rwx/03-rwx-test.yaml

diff --git a/examples/rwx/01-security.yaml b/examples/rwx/01-security.yaml
deleted file mode 100644
index e11c910..0000000
--- a/examples/rwx/01-security.yaml
+++ /dev/null
@@ -1,85 +0,0 @@
-apiVersion: policy/v1beta1
-kind: PodSecurityPolicy
-metadata:
-  name: longhorn-nfs-provisioner
-spec:
-  fsGroup:
-    rule: RunAsAny
-  allowedCapabilities:
-    - DAC_READ_SEARCH
-    - SYS_RESOURCE
-  runAsUser:
-    rule: RunAsAny
-  seLinux:
-    rule: RunAsAny
-  supplementalGroups:
-    rule: RunAsAny
-  volumes:
-    - configMap
-    - downwardAPI
-    - emptyDir
-    - persistentVolumeClaim
-    - secret
-    - hostPath
----
-kind: ClusterRole
-apiVersion: rbac.authorization.k8s.io/v1
-metadata:
-  name: longhorn-nfs-provisioner
-rules:
-  - apiGroups: [""]
-    resources: ["persistentvolumes"]
-    verbs: ["get", "list", "watch", "create", "delete"]
-  - apiGroups: [""]
-    resources: ["persistentvolumeclaims"]
-    verbs: ["get", "list", "watch", "update"]
-  - apiGroups: ["storage.k8s.io"]
-    resources: ["storageclasses"]
-    verbs: ["get", "list", "watch"]
-  - apiGroups: [""]
-    resources: ["events"]
-    verbs: ["create", "update", "patch"]
-  - apiGroups: [""]
-    resources: ["services", "endpoints"]
-    verbs: ["get"]
-  - apiGroups: ["extensions"]
-    resources: ["podsecuritypolicies"]
-    resourceNames: ["longhorn-nfs-provisioner"]
-    verbs: ["use"]
----
-kind: ClusterRoleBinding
-apiVersion: rbac.authorization.k8s.io/v1
-metadata:
-  name: longhorn-nfs-provisioner
-subjects:
-  - kind: ServiceAccount
-    name: longhorn-nfs-provisioner
-    namespace: longhorn-system
-roleRef:
-  kind: ClusterRole
-  name: longhorn-nfs-provisioner
-  apiGroup: rbac.authorization.k8s.io
----
-kind: Role
-apiVersion: rbac.authorization.k8s.io/v1
-metadata:
-  name: leader-locking-longhorn-nfs-provisioner
-  namespace: longhorn-system
-rules:
-  - apiGroups: [""]
-    resources: ["endpoints"]
-    verbs: ["get", "list", "watch", "create", "update", "patch"]
----
-kind: RoleBinding
-apiVersion: rbac.authorization.k8s.io/v1
-metadata:
-  name: leader-locking-longhorn-nfs-provisioner
-  namespace: longhorn-system
-subjects:
-  - kind: ServiceAccount
-    name: longhorn-nfs-provisioner
-    namespace: longhorn-system
-roleRef:
-  kind: Role
-  name: leader-locking-longhorn-nfs-provisioner
-  apiGroup: rbac.authorization.k8s.io
diff --git a/examples/rwx/02-longhorn-nfs-provisioner.yaml b/examples/rwx/02-longhorn-nfs-provisioner.yaml
deleted file mode 100644
index 3a8016a..0000000
--- a/examples/rwx/02-longhorn-nfs-provisioner.yaml
+++ /dev/null
@@ -1,178 +0,0 @@
-apiVersion: v1
-kind: ServiceAccount
-metadata:
-  name: longhorn-nfs-provisioner
-  namespace: longhorn-system
----
-kind: Service
-apiVersion: v1
-metadata:
-  name: longhorn-nfs-provisioner
-  namespace: longhorn-system
-  labels:
-    app: longhorn-nfs-provisioner
-spec:
-  # hardcode a cluster ip for the service
-  # so that on delete & recreate of the service the previous pv's still point
-  # to this nfs-provisioner, pick a new ip for each new nfs provisioner
-  clusterIP: 10.43.111.111
-  ports:
-    - name: nfs
-      port: 2049
-    - name: nfs-udp
-      port: 2049
-      protocol: UDP
-    - name: nlockmgr
-      port: 32803
-    - name: nlockmgr-udp
-      port: 32803
-      protocol: UDP
-    - name: mountd
-      port: 20048
-    - name: mountd-udp
-      port: 20048
-      protocol: UDP
-    - name: rquotad
-      port: 875
-    - name: rquotad-udp
-      port: 875
-      protocol: UDP
-    - name: rpcbind
-      port: 111
-    - name: rpcbind-udp
-      port: 111
-      protocol: UDP
-    - name: statd
-      port: 662
-    - name: statd-udp
-      port: 662
-      protocol: UDP
-  selector:
-    app: longhorn-nfs-provisioner
----
-kind: Deployment
-apiVersion: apps/v1
-metadata:
-  name: longhorn-nfs-provisioner
-  namespace: longhorn-system
-spec:
-  selector:
-    matchLabels:
-      app: longhorn-nfs-provisioner
-  replicas: 1
-  strategy:
-    type: Recreate
-  template:
-    metadata:
-      labels:
-        app: longhorn-nfs-provisioner
-    spec:
-      serviceAccount: longhorn-nfs-provisioner
-      containers:
-        - name: longhorn-nfs-provisioner
-          image: quay.io/kubernetes_incubator/nfs-provisioner:latest
-          ports:
-            - name: nfs
-              containerPort: 2049
-            - name: nfs-udp
-              containerPort: 2049
-              protocol: UDP
-            - name: nlockmgr
-              containerPort: 32803
-            - name: nlockmgr-udp
-              containerPort: 32803
-              protocol: UDP
-            - name: mountd
-              containerPort: 20048
-            - name: mountd-udp
-              containerPort: 20048
-              protocol: UDP
-            - name: rquotad
-              containerPort: 875
-            - name: rquotad-udp
-              containerPort: 875
-              protocol: UDP
-            - name: rpcbind
-              containerPort: 111
-            - name: rpcbind-udp
-              containerPort: 111
-              protocol: UDP
-            - name: statd
-              containerPort: 662
-            - name: statd-udp
-              containerPort: 662
-              protocol: UDP
-          securityContext:
-            capabilities:
-              add:
-                - DAC_READ_SEARCH
-                - SYS_RESOURCE
-          args:
-            - "-provisioner=nfs.longhorn.io"
-            - "-device-based-fsids=false"
-          env:
-            - name: POD_IP
-              valueFrom:
-                fieldRef:
-                  fieldPath: status.podIP
-            - name: SERVICE_NAME
-              value: longhorn-nfs-provisioner
-            - name: POD_NAMESPACE
-              valueFrom:
-                fieldRef:
-                  fieldPath: metadata.namespace
-          imagePullPolicy: "IfNotPresent"
-          readinessProbe:
-            exec:
-              command:
-                - ls
-                - /export
-            initialDelaySeconds: 5
-            periodSeconds: 5
-          livenessProbe:
-            exec:
-              command:
-                - ls
-                - /export
-            initialDelaySeconds: 5
-            periodSeconds: 5
-          volumeMounts:
-            - name: export-volume
-              mountPath: /export
-      volumes:
-        - name: export-volume
-          persistentVolumeClaim:
-            claimName: longhorn-nfs-provisioner
-      # we want really quick failover
-      terminationGracePeriodSeconds: 30
-      tolerations:
-        - effect: NoExecute
-          key: node.kubernetes.io/not-ready
-          operator: Exists
-          tolerationSeconds: 60
-        - effect: NoExecute
-          key: node.kubernetes.io/unreachable
-          operator: Exists
-          tolerationSeconds: 60
----
-apiVersion: v1
-kind: PersistentVolumeClaim
-metadata:
-  name: longhorn-nfs-provisioner # longhorn backing pvc
-  namespace: longhorn-system
-spec:
-  storageClassName: longhorn
-  accessModes:
-    - ReadWriteOnce
-  resources:
-    requests:
-      storage: "20G" # make this 10% bigger then the workload pvc
----
-apiVersion: storage.k8s.io/v1
-kind: StorageClass
-metadata:
-  name: longhorn-nfs # workload storage class
-provisioner: nfs.longhorn.io
-mountOptions:
-  - "vers=4.1"
-  - "noresvport"
diff --git a/examples/rwx/03-rwx-test.yaml b/examples/rwx/03-rwx-test.yaml
deleted file mode 100644
index d138dea..0000000
--- a/examples/rwx/03-rwx-test.yaml
+++ /dev/null
@@ -1,59 +0,0 @@
-apiVersion: v1
-kind: PersistentVolumeClaim
-metadata:
-  name: nfs-test
-  namespace: default
-spec:
-  accessModes:
-    - ReadWriteMany
-  storageClassName: longhorn-nfs
-  resources:
-    requests:
-      storage: 1Gi
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: nfs-test
-  labels:
-    app: nfs-test
-  namespace: default
-spec:
-  replicas: 4
-  selector:
-    matchLabels:
-      app: nfs-test
-  strategy:
-    type: Recreate
-  template:
-    metadata:
-      labels:
-        app: nfs-test
-    spec:
-      containers:
-        - image: ubuntu:xenial
-          imagePullPolicy: Always
-          command: ["/bin/sh", "-c"]
-          args:
-            - sleep 30; touch /mnt/nfs-test/test.log; while true; do date >> /mnt/nfs-test/test.log; sleep 1; done;
-          name: nfs-test
-          stdin: true
-          tty: true
-          livenessProbe:
-            exec:
-              command:
-                - timeout
-                - "10"
-                - ls
-                - /mnt/nfs-test
-            initialDelaySeconds: 10
-            periodSeconds: 10
-            timeoutSeconds: 10
-          volumeMounts:
-            - mountPath: /mnt/nfs-test
-              name: nfs-test
-      restartPolicy: Always
-      volumes:
-        - name: nfs-test
-          persistentVolumeClaim:
-            claimName: nfs-test