目录

k8s 对 NoSchedule taint:node.kubernetes.io/unschedulable 的处理

问题:通过 node.kubernetes.io/unschedulable 这个 taint 将节点设置为不可调度,未能生效

k8s版本:1.20.5

  • 通过 kubectl taint 命令将节点设置为不可调度,并驱逐 pod
kubectl taint nodes node3 node.kubernetes.io/unschedulable=:NoSchedule

kubectl taint nodes node3 node.kubernetes.io/unschedulable=:NoExecute

发现命令执行成功,但node的taints为空,node.kubernetes.io/unschedulable 的 key:effect没有生效。

此时节点仍可调度。

  • 而执行其它 key 的 taint 操作,则能成功并生效

kubectl taint nodes node3 node-role.kubernetes.io/node=:NoSchedule

kubectl taint nodes node3 node-role.kubernetes.io/node=:NoExecute


kubectl taint nodes node3 node-role.kubernetes.io/node=:NoSchedule-

kubectl taint nodes node3 node-role.kubernetes.io/node=:NoExecute-

执行前两条添加命令后,节点被设置为不可调度,并能驱逐 pod;后两条以 "-" 结尾的命令用于移除对应的 taint。

node-lifecycle-controller

Kube-controller-manager之node-lifecycle-controller 处理了node的状态信息

well_known_taints

可以看到 k8s 已为 node 内置了一些 taint key:

package v1

// Well-known taint keys, all under the "node.kubernetes.io/" prefix.
const (
   // TaintNodeNotReady will be added when node is not ready
   // and removed when node becomes ready.
   TaintNodeNotReady = "node.kubernetes.io/not-ready"

   // TaintNodeUnreachable will be added when node becomes unreachable
   // (corresponding to NodeReady status ConditionUnknown)
   // and removed when node becomes reachable (NodeReady status ConditionTrue).
   TaintNodeUnreachable = "node.kubernetes.io/unreachable"

   // TaintNodeUnschedulable will be added when node becomes unschedulable
   // and removed when node becomes schedulable.
   // This is the taint key currently used to mark a node as unschedulable.
   TaintNodeUnschedulable = "node.kubernetes.io/unschedulable"

   // TaintNodeMemoryPressure will be added when node has memory pressure
   // and removed when node has enough memory.
   TaintNodeMemoryPressure = "node.kubernetes.io/memory-pressure"

   // TaintNodeDiskPressure will be added when node has disk pressure
   // and removed when node has enough disk.
   TaintNodeDiskPressure = "node.kubernetes.io/disk-pressure"

   // TaintNodeNetworkUnavailable will be added when node's network is unavailable
   // and removed when network becomes ready.
   TaintNodeNetworkUnavailable = "node.kubernetes.io/network-unavailable"

   // TaintNodePIDPressure will be added when node has pid pressure
   // and removed when node no longer has pid pressure.
   TaintNodePIDPressure = "node.kubernetes.io/pid-pressure"
)

NodeLifecycle

// NewNodeLifecycleController builds a node lifecycle *Controller from the
// supplied informers, API client and tuning parameters, returning an error
// if construction fails.
//
// NOTE(review): the body is elided in this excerpt, so the meaning of each
// parameter (grace periods, eviction limiter QPS values, zone thresholds,
// runTaintManager) cannot be confirmed here — verify against the upstream
// implementation before relying on any of them.
func NewNodeLifecycleController(
	leaseInformer coordinformers.LeaseInformer,
	podInformer coreinformers.PodInformer,
	nodeInformer coreinformers.NodeInformer,
	daemonSetInformer appsv1informers.DaemonSetInformer,
	kubeClient clientset.Interface,
	nodeMonitorPeriod time.Duration,
	nodeStartupGracePeriod time.Duration,
	nodeMonitorGracePeriod time.Duration,
	podEvictionTimeout time.Duration,
	evictionLimiterQPS float32,
	secondaryEvictionLimiterQPS float32,
	largeClusterThreshold int32,
	unhealthyZoneThreshold float32,
	runTaintManager bool,
) (*Controller, error) {
    // ... (implementation omitted in this excerpt)
}
    

Taint

// Taint describes a key, an optional value and an effect attached to a node.
// The node this Taint is attached to has the "effect" on
// any pod that does not tolerate the Taint.
type Taint struct {
   // Required. The taint key to be applied to a node.
   Key string `json:"key" protobuf:"bytes,1,opt,name=key"`
   // The taint value corresponding to the taint key.
   // +optional
   Value string `json:"value,omitempty" protobuf:"bytes,2,opt,name=value"`
   // Required. The effect of the taint on pods
   // that do not tolerate the taint.
   // Valid effects are NoSchedule, PreferNoSchedule and NoExecute.
   Effect TaintEffect `json:"effect" protobuf:"bytes,3,opt,name=effect,casttype=TaintEffect"`
   // TimeAdded represents the time at which the taint was added.
   // It is only written for NoExecute taints.
   // +optional
   TimeAdded *metav1.Time `json:"timeAdded,omitempty" protobuf:"bytes,4,opt,name=timeAdded"`
}

// TaintEffect names the effect a taint has on pods that do not tolerate it.
type TaintEffect string

// Effects that a Taint may carry.
const (
   // TaintEffectNoSchedule:
   // Do not allow new pods to schedule onto the node unless they tolerate the taint,
   // but allow all pods submitted to Kubelet without going through the scheduler
   // to start, and allow all already-running pods to continue running.
   // Enforced by the scheduler.
   TaintEffectNoSchedule TaintEffect = "NoSchedule"
   // TaintEffectPreferNoSchedule:
   // Like TaintEffectNoSchedule, but the scheduler tries not to schedule
   // new pods onto the node, rather than prohibiting new pods from scheduling
   // onto the node entirely. Enforced by the scheduler.
   TaintEffectPreferNoSchedule TaintEffect = "PreferNoSchedule"
   // NOT YET IMPLEMENTED. TODO: Uncomment field once it is implemented.
   // Like TaintEffectNoSchedule, but additionally do not allow pods submitted to
   // Kubelet without going through the scheduler to start.
   // Enforced by Kubelet and the scheduler.
   // TaintEffectNoScheduleNoAdmit TaintEffect = "NoScheduleNoAdmit"

   // TaintEffectNoExecute:
   // Evict any already-running pods that do not tolerate the taint.
   // Currently enforced by NodeController.
   TaintEffectNoExecute TaintEffect = "NoExecute"
)

doNoScheduleTaintingPass

注意这里对 taints 的处理:

// doNoScheduleTaintingPass reconciles the NoSchedule taints on the named node.
//
// It builds the desired taint set from the node's conditions (looked up in
// nodeConditionToTaintKeyStatusMap) plus the unschedulable taint when
// node.Spec.Unschedulable is true, compares it with the matching NoSchedule
// taints already present on the node, and swaps in the difference.
//
// It returns nil when the node is not found or when nothing needs to change,
// and an error when the node lookup fails for another reason or the swap fails.
func (nc *Controller) doNoScheduleTaintingPass(nodeName string) error {
   node, err := nc.nodeLister.Get(nodeName)
   if err != nil {
      // If node not found, just ignore it.
      if apierrors.IsNotFound(err) {
         return nil
      }
      return err
   }

   // Map node's condition to Taints.
   var taints []v1.Taint
   for _, condition := range node.Status.Conditions {
      if taintMap, found := nodeConditionToTaintKeyStatusMap[condition.Type]; found {
         if taintKey, found := taintMap[condition.Status]; found {
            taints = append(taints, v1.Taint{
               Key:    taintKey,
               Effect: v1.TaintEffectNoSchedule,
            })
         }
      }
   }
   // node.Spec.Unschedulable is checked first: the TaintNodeUnschedulable key
   // ("node.kubernetes.io/unschedulable") is put into the desired set only
   // when node.Spec.Unschedulable is true.
   if node.Spec.Unschedulable {
      // If unschedulable, append related taint.
      taints = append(taints, v1.Taint{
         Key:    v1.TaintNodeUnschedulable,
         Effect: v1.TaintEffectNoSchedule,
      })
   }

   // Get exist taints of node.
   // The existing taints are filtered here: only NoSchedule taints whose key
   // is TaintNodeUnschedulable or appears in taintKeyToNodeConditionMap are kept.
   nodeTaints := taintutils.TaintSetFilter(node.Spec.Taints, func(t *v1.Taint) bool {
      // only NoSchedule taints are candidates to be compared with "taints" later
      if t.Effect != v1.TaintEffectNoSchedule {
         return false
      }
      // Find unschedulable taint of node.
      if t.Key == v1.TaintNodeUnschedulable {
         return true
      }
      // Find node condition taints of node.
      _, found := taintKeyToNodeConditionMap[t.Key]
      return found
   })
   // Compare the desired taints with the node's existing (filtered) taints.
   // NOTE(review): presumably taintsToDel holds taints present on the node but
   // absent from the desired set — confirm against taintutils.TaintSetDiff.
   taintsToAdd, taintsToDel := taintutils.TaintSetDiff(taints, nodeTaints)
   // If there is nothing to add or delete, return nil directly.
   if len(taintsToAdd) == 0 && len(taintsToDel) == 0 {
      return nil
   }
   if !nodeutil.SwapNodeControllerTaint(nc.kubeClient, taintsToAdd, taintsToDel, node) {
      return fmt.Errorf("failed to swap taints of node %+v", node)
   }
   return nil
}

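总结:k8s 对节点的 TaintNodeUnschedulable taint 的处理以 node.Spec.Unschedulable 标识为准:node-lifecycle-controller 的 doNoScheduleTaintingPass 只有在 node.Spec.Unschedulable 为 true 时,才会把 "node.kubernetes.io/unschedulable" 加入期望的 taints。如果只是通过 kubectl taint 手动添加该 key 的 NoSchedule taint,而 node.Spec.Unschedulable 仍为 false,那么它会出现在 nodeTaints 中、却不在期望的 taints 中,于是被归入 taintsToDel 并被控制器移除,所以看起来"没有生效"。在 k8s 1.20.5 版本的测试中,只要把 node.Spec.Unschedulable 设置为 true(例如执行 kubectl cordon node3),k8s 就会自动添加 TaintNodeUnschedulable key:"node.kubernetes.io/unschedulable"。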

附录

除了 taint,还要考虑容忍度(Toleration)对污点(Taint)的影响,可以参考下面的文章:

k8s 的污点(Taint)和容忍度(Toleration)https://zhuanlan.zhihu.com/p/405348246