Kubernetes Operator 开发实践:从 CRD 到控制器
# Kubernetes Operator 开发实践:从 CRD 到控制器

## 引言

Kubernetes Operator 是一种软件扩展,用于管理复杂的有状态应用。通过自定义资源定义(CRD)和控制器,可以将领域知识编码到 Kubernetes 中,实现自动化运维。本文将深入探讨如何开发一个 Kubernetes Operator。

## Operator 基础概念

### 什么是 Operator

Operator 是一种 Kubernetes 原生应用管理模式,其核心思想是:

- 声明式管理:使用 CRD 定义应用的期望状态
- 自定义控制器:监控 CR 并自动调整实际状态
- 领域知识编码:将运维经验转化为代码

### Operator 模式架构

```
┌────────────────────────────────────────────────────────────┐
│                   Kubernetes API Server                    │
└─────────────────────────────┬──────────────────────────────┘
                              │
         ┌────────────────────┼────────────────────┐
         ▼                    ▼                    ▼
┌──────────────────┐ ┌──────────────────┐ ┌──────────────────┐
│ Custom           │ │ Controller       │ │ Application      │
│ Resource         │ │ (Operator)       │ │ (Managed)        │
│ (CR)             │ │                  │ │                  │
└──────────────────┘ └──────────────────┘ └──────────────────┘
         │                    │                    │
         └────────────────────┼────────────────────┘
                              ▼
                     ┌──────────────────┐
                     │ Reconcile        │
                     │ Loop             │
                     └──────────────────┘
```

## CRD 定义与创建

### CRD 配置

```yaml
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
  name: databases.example.com
spec:
  group: example.com
  names:
    kind: Database
    listKind: DatabaseList
    plural: databases
    singular: database
  scope: Namespaced
  versions:
    - name: v1
      served: true
      storage: true
      schema:
        openAPIV3Schema:
          type: object
          properties:
            spec:
              type: object
              properties:
                version:
                  type: string
                replicas:
                  type: integer
                storage:
                  type: string
              required:
                - version
                - replicas
```

### 创建 CRD

```bash
# 应用 CRD
kubectl apply -f database-crd.yaml

# 验证 CRD
kubectl get crd databases.example.com
```

## Operator 控制器开发

### 控制器结构

```go
package main

import (
	"context"
	"flag"
	"os"

	"github.com/operator-framework/operator-sdk/pkg/log/zap"
	"github.com/spf13/pflag"
	"k8s.io/apimachinery/pkg/runtime"
	clientgoscheme "k8s.io/client-go/kubernetes/scheme"
	_ "k8s.io/client-go/plugin/pkg/client/auth"
	"sigs.k8s.io/controller-runtime/pkg/client/config"
	"sigs.k8s.io/controller-runtime/pkg/manager"
	"sigs.k8s.io/controller-runtime/pkg/manager/signals"

	examplev1 "github.com/my-org/my-operator/api/v1"
	"github.com/my-org/my-operator/controllers"
)

var (
	scheme   = runtime.NewScheme()
	setupLog = log.Log.WithName("setup")
)

func init() {
	_ = clientgoscheme.AddToScheme(scheme)
	_ = examplev1.AddToScheme(scheme)
}

func main() {
	pflag.CommandLine.AddFlagSet(zap.FlagSet())
	pflag.Parse()

	log := zap.NewLogger()
	ctx := context.Background()
	ctx = log.IntoContext(ctx)

	cfg, err := config.GetConfig()
	if err != nil {
		setupLog.Error(err, "Unable to get config")
		os.Exit(1)
	}

	mgr, err := manager.New(cfg, manager.Options{Scheme: scheme})
	if err != nil {
		setupLog.Error(err, "Unable to create manager")
		os.Exit(1)
	}

	if err = (&controllers.DatabaseReconciler{
		Client: mgr.GetClient(),
		Log:    log.WithName("controllers").WithName("Database"),
		Scheme: mgr.GetScheme(),
	}).SetupWithManager(mgr); err != nil {
		setupLog.Error(err, "Unable to create controller", "controller", "Database")
		os.Exit(1)
	}

	setupLog.Info("Starting manager")
	if err := mgr.Start(signals.SetupSignalHandler()); err != nil {
		setupLog.Error(err, "Manager exited non-zero")
		os.Exit(1)
	}
}
```

### Reconcile 函数实现

```go
package controllers

import (
	"context"

	"github.com/go-logr/logr"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/apimachinery/pkg/types"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"

	examplev1 "github.com/my-org/my-operator/api/v1"
	appsv1 "k8s.io/api/apps/v1"
	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

type DatabaseReconciler struct {
	client.Client
	Log    logr.Logger
	Scheme *runtime.Scheme
}

func (r *DatabaseReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	log := r.Log.WithValues("database", req.NamespacedName)

	var database examplev1.Database
	if err := r.Get(ctx, req.NamespacedName, &database); err != nil {
		if errors.IsNotFound(err) {
			return ctrl.Result{}, nil
		}
		log.Error(err, "Unable to fetch Database")
		return ctrl.Result{}, err
	}

	var deploy appsv1.Deployment
	deploy.Name = database.Name
	deploy.Namespace = database.Namespace
	if err := r.Get(ctx, types.NamespacedName{Name: database.Name, Namespace: database.Namespace}, &deploy); err != nil {
		if errors.IsNotFound(err) {
			dep := r.deploymentForDatabase(&database)
			log.Info("Creating a new Deployment", "Deployment.Namespace", dep.Namespace, "Deployment.Name", dep.Name)
			if err := r.Create(ctx, dep); err != nil {
				log.Error(err, "Unable to create Deployment", "Deployment.Namespace", dep.Namespace, "Deployment.Name", dep.Name)
				return ctrl.Result{}, err
			}
			database.Status.ReadyReplicas = 0
			if err := r.Update(ctx, &database); err != nil {
				log.Error(err, "Unable to update Database status")
				return ctrl.Result{}, err
			}
			return ctrl.Result{Requeue: true}, nil
		}
		log.Error(err, "Unable to fetch Deployment")
		return ctrl.Result{}, err
	}

	if *deploy.Spec.Replicas != database.Spec.Replicas {
		deploy.Spec.Replicas = &database.Spec.Replicas
		if err := r.Update(ctx, &deploy); err != nil {
			log.Error(err, "Unable to update Deployment", "Deployment.Namespace", deploy.Namespace, "Deployment.Name", deploy.Name)
			return ctrl.Result{}, err
		}
		return ctrl.Result{Requeue: true}, nil
	}

	if database.Status.ReadyReplicas != deploy.Status.ReadyReplicas {
		database.Status.ReadyReplicas = deploy.Status.ReadyReplicas
		if err := r.Update(ctx, &database); err != nil {
			log.Error(err, "Unable to update Database status")
			return ctrl.Result{}, err
		}
	}

	return ctrl.Result{}, nil
}

func (r *DatabaseReconciler) deploymentForDatabase(d *examplev1.Database) *appsv1.Deployment {
	ls := labelsForDatabase(d.Name)
	replicas := d.Spec.Replicas

	dep := &appsv1.Deployment{
		ObjectMeta: metav1.ObjectMeta{
			Name:      d.Name,
			Namespace: d.Namespace,
		},
		Spec: appsv1.DeploymentSpec{
			Replicas: &replicas,
			Selector: &metav1.LabelSelector{
				MatchLabels: ls,
			},
			Template: corev1.PodTemplateSpec{
				ObjectMeta: metav1.ObjectMeta{
					Labels: ls,
				},
				Spec: corev1.PodSpec{
					Containers: []corev1.Container{{
						Image:           "postgres:" + d.Spec.Version,
						Name:            "database",
						ImagePullPolicy: corev1.PullIfNotPresent,
						Ports: []corev1.ContainerPort{{
							ContainerPort: 5432,
							Name:          "postgres",
						}},
					}},
				},
			},
		},
	}

	ctrl.SetControllerReference(d, dep, r.Scheme)
	return dep
}

func labelsForDatabase(name string) map[string]string {
	return map[string]string{"app": "database", "database_cr": name}
}

func (r *DatabaseReconciler) SetupWithManager(mgr ctrl.Manager) error {
	return ctrl.NewControllerManagedBy(mgr).
		For(&examplev1.Database{}).
		Owns(&appsv1.Deployment{}).
		Complete(r)
}
```

## CR 示例与使用

### 创建 CR 实例

```yaml
apiVersion: example.com/v1
kind: Database
metadata:
  name: my-postgres
  namespace: default
spec:
  version: "14"
  replicas: 3
  storage: 10Gi
```

### 应用 CR

```bash
# 应用 CR
kubectl apply -f my-postgres.yaml

# 查看 CR
kubectl get databases

# 查看 CR 详情
kubectl describe database my-postgres
```

## Operator 部署

### 部署配置

```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: database-operator
  namespace: operators
spec:
  replicas: 1
  selector:
    matchLabels:
      name: database-operator
  template:
    metadata:
      labels:
        name: database-operator
    spec:
      serviceAccountName: database-operator
      containers:
        - name: database-operator
          image: my-org/database-operator:latest
          ports:
            - containerPort: 60000
              name: metrics
          command:
            - database-operator
          imagePullPolicy: Always
```

### RBAC 配置

```yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: database-operator
  namespace: operators
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: database-operator
rules:
  - apiGroups:
      - example.com
    resources:
      - databases
      - databases/status
    verbs:
      - "*"
  - apiGroups:
      - apps
    resources:
      - deployments
    verbs:
      - "*"
  - apiGroups:
      - ""
    resources:
      - pods
      - services
    verbs:
      - "*"
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: database-operator
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: database-operator
subjects:
  - kind: ServiceAccount
    name: database-operator
    namespace: operators
```

## Operator SDK 使用

### 初始化项目

```bash
# 安装 Operator SDK
curl -LO https://github.com/operator-framework/operator-sdk/releases/download/v1.28.0/operator-sdk_darwin_amd64
chmod +x operator-sdk_darwin_amd64
sudo mv operator-sdk_darwin_amd64 /usr/local/bin/operator-sdk

# 创建项目
operator-sdk init --domain example.com --repo github.com/my-org/my-operator

# 创建 API
operator-sdk create api --group example --version v1 --kind Database --resource --controller
```

### 构建与发布

```bash
# 构建镜像
make docker-build docker-push IMG=my-org/database-operator:latest

# 部署 Operator
make install
make deploy IMG=my-org/database-operator:latest
```

## Operator 最佳实践

### 状态管理

```go
// 使用 Finalizer 确保资源清理
func (r *DatabaseReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	// 添加 Finalizer
	if !containsString(database.GetFinalizers(), finalizerName) {
		database.SetFinalizers(append(database.GetFinalizers(), finalizerName))
		if err := r.Update(ctx, database); err != nil {
			return ctrl.Result{}, err
		}
	}

	// 处理删除
	if database.GetDeletionTimestamp() != nil {
		if containsString(database.GetFinalizers(), finalizerName) {
			if err := r.cleanupResources(ctx, database); err != nil {
				return ctrl.Result{}, err
			}
			database.SetFinalizers(removeString(database.GetFinalizers(), finalizerName))
			if err := r.Update(ctx, database); err != nil {
				return ctrl.Result{}, err
			}
		}
		return ctrl.Result{}, nil
	}
}
```

### 错误处理

```go
func (r *DatabaseReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	// 重试逻辑
	if err != nil {
		// 可重试错误,设置 Requeue
		if isRetryableError(err) {
			return ctrl.Result{Requeue: true, RequeueAfter: time.Second * 30}, nil
		}
		// 不可重试错误,返回错误
		return ctrl.Result{}, err
	}
	return ctrl.Result{}, nil
}
```

### 资源限制

```yaml
apiVersion: v1
kind: LimitRange
metadata:
  name: operator-limits
  namespace: operators
spec:
  limits:
    - type: Container
      max:
        cpu: 1
        memory: 512Mi
      min:
        cpu: 100m
        memory: 128Mi
```

## 常见问题与解决方案

### 问题 1:CRD 注册失败

排查步骤:

```bash
# 检查 CRD 状态
kubectl get crd databases.example.com -o yaml

# 查看 API Server 日志
kubectl logs -n kube-system kube-apiserver-0 | grep CRD
```

解决方案:

- 检查 CRD YAML 语法
- 验证 API 版本兼容性
- 确认权限配置

### 问题 2:控制器无法获取资源

排查步骤:

```bash
# 检查控制器日志
kubectl logs -n operators database-operator-0

# 检查 RBAC 配置
kubectl get clusterrole database-operator -o yaml
```

解决方案:

- 验证 RBAC 权限
- 检查 ServiceAccount 配置
- 确认命名空间权限

### 问题 3:Reconcile 循环过频

排查步骤:

```bash
# 检查控制器日志频率
kubectl logs -n operators database-operator-0 | grep Reconcile

# 检查事件
kubectl get events -n operators
```

解决方案:

- 使用 RequeueAfter 控制频率
- 避免不必要的更新
- 优化状态检查逻辑

## 总结

Kubernetes Operator 提供了一种强大的方式来管理复杂的有状态应用。通过 CRD 和自定义控制器,可以将运维知识编码到 Kubernetes 中,实现自动化管理。在实际开发中,需要注意状态管理、错误处理和资源优化,构建稳定可靠的 Operator。

## 参考文献

- Operator SDK Documentation: https://sdk.operatorframework.io/
- Kubernetes CRD Documentation: https://kubernetes.io/docs/tasks/extend-kubernetes/custom-resources/custom-resource-definitions/
- Operator Pattern: https://kubernetes.io/docs/concepts/extend-kubernetes/operator/