nomad编排容器

# vscode扩展
hashicorp.hcl
fredwangwang.vscode-hcl-format

docs

# 推荐直接下载二进制运行
# https://releases.hashicorp.com/nomad/
wget https://releases.hashicorp.com/nomad/1.6.2/nomad_1.6.2_linux_amd64.zip
  • 配置 consul

# consul 不是必须的: 单节点可以直接使用 nomad 自带的服务注册(provider = "nomad", 模板里配合 range nomadService 使用); 只有多节点时才需要 consul
wget https://releases.hashicorp.com/consul/1.16.2/consul_1.16.2_linux_amd64.zip

https://developer.hashicorp.com/consul/docs/agent#configuring-consul-agents
# 提示异常:  dial tcp 127.0.0.1:8500: connect: connection refused
# plan提示: Constraint ${attr.consul.version} semver >= 1.7.0 filtered 1 node
# 默认 consul 块将自动与所有 Nomad 代理配置合并, 如果在系统上检测到 Consul,这些合理的默认值会自动启用 Consul 集成. 节点发现等
# 如果 job 使用了 consul 作为服务提供程序(provider = "consul"), 则 consul 必须处于运行状态, 否则会出现上面的通信异常

### 默认无密码
GET http://127.0.0.1:8500/v1/catalog/services
# ui界面
http://127.0.0.1:8500/ui/
# 配置文件
/etc/consul.hcl
# consul 的 acl: https://developer.hashicorp.com/consul/docs/security/acl/tokens

# Consul agent configuration (/etc/consul.hcl): one server node that
# bootstraps itself, with the UI, Connect and ACLs enabled.

# Identity, storage and logging.
datacenter = "dc1"
node_name  = "consul-server"
data_dir   = "/consul/data"
log_level  = "INFO"

# Server mode, bootstrapping on its own.
server    = true
bootstrap = true

# Address to bind to: the public / externally exposed IP, or the private IP
# of the primary NIC. It must be an address that shows up in `ip a`.
bind_addr = "172.26.156.175"

# Serve the HTTP API (and UI) on every interface.
addresses {
  http = "0.0.0.0"
}

ui_config {
  enabled = true
}

connect {
  enabled = true
}

# ACLs: deny by default and persist tokens set on the agent.
acl = {
  default_policy           = "deny"
  enabled                  = true
  enable_token_persistence = true
}

# 注意 consul agent -dev 只能本地可以访问, 不能持久化数据, 正式环境使用 -server;
sudo consul agent -config-file=/etc/consul.hcl



# 获取初始管理 token(bootstrap token), 后续命令使用输出中的 SecretID
consul acl bootstrap
export CONSUL_HTTP_TOKEN=ba1f15c5-474b-7a6b-0e5c-689470913096
# 查看节点
consul members
# 查看token列表, 需要创建一个Local: true的token
consul acl token list
# 创建
consul acl policy list
consul acl token create -policy-id global-management -local -description "my token" -node-identity "main:main"

consul acl token delete -accessor-id=89ea3fba-7727-14b0-1dd3-afa5fd8983ed

# AccessorID:       89ea3fba-7727-14b0-1dd3-afa5fd8983ed
# SecretID:         286285b7-b94b-286c-aed0-99f6071c951a
# Description:      my token
# Local:            true
# Create Time:      2023-10-11 13:51:53.849210887 +0800 CST
# Node Identities:
#    main (Datacenter: main)

# consul 会按照服务注册时定义的 check 定期执行健康检查
consul catalog datacenters
consul catalog services -tags
consul services deregister -id=xx

# 权限策略
consul acl policy list
# 策略创建/更新
# Permission denied: anonymous token lacks permission 'agent:read'
consul acl policy create -name "anonymous-readagent" -description "匿名读agent" -rules 'agent_prefix "" { policy = "read" }'
consul acl policy update -name "anonymous-readagent" -description "匿名读agent" -rules 'agent_prefix "" { policy = "read" }'
consul acl policy update -name "anonymous-readagent" -description "匿名读agent" \
  -rules 'agent_prefix "" { policy = "read" }, service_prefix "" { policy = "write" }'

# 查看
consul acl policy read -name anonymous-readagent
# 给token增加策略
consul acl token update -accessor-id anonymous \
  -policy-name "anonymous-readagent"
  • 运行服务

# 版本
nomad --version
# Nomad v1.2.8

# 如果启动时报错 "Must specify either server, client or dev mode for the agent.", 说明没有通过配置文件或参数指定运行模式; 先查看服务状态
systemctl status nomad
# 实际运行的命令, 配置支持conf/json/hcl, 推荐hcl
sudo nomad agent -config=/etc/nomad.hcl
# 配置文档: https://developer.hashicorp.com/nomad/docs/configuration
# 起来server和client
# /etc/nomad.hcl
# Nomad agent configuration: server and client run in the same agent.
data_dir = "/var/lib/nomad"

client {
  enabled = true
}

server {
  bootstrap_expect = 1
  enabled          = true
}

# Consul integration: local agent address and the ACL token Nomad presents.
consul {
  token   = "3354dd49-c77b-39c5-b9fb-cff01b21521d"
  address = "127.0.0.1:8500"
}

# Older Docker versions need privileged mode: allow it here and also set
# `privileged = true` in the task's config block.
plugin "docker" {
  config {
    allow_privileged = true
  }
}
# 需要 配置 acl 授权验证



# 管理界面,默认端口4646, 会跟随api同时启动
nomad ui -show-url
# URL for web UI: http://127.0.0.1:4646

# 如果服务异常直接删除data_dir的数据, 比如提示Duplicate client-id
https://support.hashicorp.com/hc/en-us/articles/7922521461651-Duplicate-client-id
  • 管理

# Client nodes (subcommand form; `node-status` is the legacy hyphenated spelling)
nomad node status
# ID        DC   Name    Class   Drain  Eligibility  Status
# e039952a  dc1  master  <none>  false  eligible     ready

# Servers
nomad server members
# Name           Address       Port  Status  Leader  Protocol  Build  Datacenter  Region
# master.global  172.28.4.133  4648  alive   true    2         1.2.8  dc1         global

# List of jobs and their status
nomad job status
# ID          Type     Priority  Status   Submit Date
# example     service  50        running  2023-03-23T10:08:35+08:00
# httpserver  service  50        running  2023-03-23T11:05:22+08:00
  • 部署配置生成

# Workloads are deployed as jobs.
# Docs: https://www.nomadproject.io/docs/job-specification/job
# Generate a sample job file (the job is named "example").
nomad job init
# The generated job uses driver = "docker"; it can be changed to podman.
# Recommendation: keep the service name and the task name identical.
# NOTE: newer Nomad releases may write example.nomad.hcl instead — use the
# file name that `nomad job init` reports.
nomad job run example.nomad
nomad job status example

# Logs of task "redis" in allocation 883269bf
nomad alloc logs 883269bf redis

# Restart task "redis" in allocation 883269bf
nomad alloc restart 883269bf redis
# Delete the job. `job stop` takes the job ID ("example"); "redis" is only
# the task name used by the alloc commands above.
nomad job stop -purge example

# Update: preview the changes, then apply them
nomad job plan nginx.nomad
nomad job run nginx.nomad

# Failed to pull `nginx:latest`: Get https://registry-1.docker.io/v2/: net/http: request canceled while waiting for connection (Client.Timeout exceeded while awaiting headers)
# https://github.com/moby/moby/issues/22635#issuecomment-224708869
# 验证服务器是否可用
curl https://registry-1.docker.io/v2/
curl https://registry.jihulab.com/v2
cat /etc/resolv.conf
  • 加入节点(servers)

# 注意节点的名称需要唯一
nomad server join [options] <addr> [<addr>...]
# 默认服务端口是4648
nomad server join 10.0.0.8:4648
  • job使用, 包含几个重要配置

# 通常是: job->group->n*task

# 分组
# https://developer.hashicorp.com/nomad/docs/job-specification/group

# 健康检查
# https://developer.hashicorp.com/nomad/docs/job-specification/check

# 自动缩放
# https://developer.hashicorp.com/nomad/docs/job-specification/scaling

# 更新策略
# https://developer.hashicorp.com/nomad/docs/job-specification/update
# 参考 NGINX: https://developer.hashicorp.com/nomad/tutorials/load-balancing/load-balancing-nginx

# 注意如果版本太低,部分template模板语法不支持
# 当服务副本发生变化时, 模板中 nomadService 查询到的数据会重新渲染, 服务随之重载
# 镜像使用带tag的镜像,否则默认的latest会每次都删除再拉,网络问题导致失败
  • template模板

# Nomad v1.4.4 可用
# 把 {{ 写成 {{-, 或把 }} 写成 -}}, 可以去掉该标签前面/后面的空格和换行

      # Renders a Caddy reverse-proxy config whose upstream list is built from
      # the Nomad-registered instances of service "httpserver".
      template {
        data = <<EOH

:8080 {
	reverse_proxy * {
		to {{range nomadService "httpserver" }}http://{{ .Address }}:{{ .Port }} {{end }}
		# 负载均衡策略
		# lb_policy ip_hash
		# 检查检查地址/状态/检查间隔时间
		# health_uri /health_uri
		# health_status 200
		# health_interval 500ms
	}
}



EOH

        # A template block needs a destination for the rendered file.
        # Example path only — point it at the file Caddy actually loads.
        destination = "local/Caddyfile"
      }
  • 关于升级

# 就是配置文件的data_dir目录,如果升级失败,直接删除重来
/var/lib/nomad

# 升级之后文件报错, 需要重新job init创建配置文件
${attr.consul.version} semver >= 1.7.0



# 查看文件系统
nomad job status
nomad job status NginxLoadBalancer
nomad alloc fs 2c0ddc48 taskNginxLoadBalancer/local/load-balancer.conf
# 或者
find /var/lib/nomad/alloc/|grep load-balancer.conf

nomad alloc logs 2c0ddc48
  • 访问测试

# linux
while true; do curl -sIL -w "%{http_code}\n" -o /dev/null http://192.168.122.204:8080/; sleep 0.1; done

# powershell
1..10000 | foreach { sleep 0.1 && curl -sIL -w "%{http_code}\n" -o /dev/null http://192.168.122.204:8080/ }
  • quick start webserver

# Image to deploy. It is declared here because the service tags below
# reference var.imageurl; without a declaration the job file does not parse.
# Override on the command line: nomad job run -var "imageurl=<image>" <file>
variable "imageurl" {
  type        = string
  description = "Container image for the web-server task"
  default     = "jcleng/adminer:latest"
}

# Quick-start web service: two instances registered in Consul, rolled out
# with one canary and automatic revert.
job "web-service" {
  # Older Nomad versions need explicit datacenter names here
  # (list them with: nomad node status).
  datacenters = ["*"]

  group "web" {
    count = 2

    network {
      # Dynamic host port mapped to container port 80.
      port "http" {
        to = 80
      }
    }

    service {
      # Tags on promoted (released) instances; the image is included as a tag.
      tags        = ["release_tag", "${var.imageurl}"]
      # Tags on canary instances.
      canary_tags = ["canary_tag"]

      name     = "serviceweb"
      port     = "http"
      provider = "consul"

      # HTTP health check against "/" every 2s.
      check {
        type     = "http"
        path     = "/"
        interval = "2s"
        timeout  = "2s"
      }
    }

    task "web-server" {
      driver = "docker"

      config {
        # Same variable as the service tag, so tag and running image agree.
        image = var.imageurl
        ports = ["http"]
      }
      resources {
        cpu    = 1500 # MHz
        memory = 800  # MB
      }
    }
  }

  # Update strategy
  update {
    # Maximum number of allocations updated in parallel
    max_parallel = 1
    # Minimum time an allocation must stay healthy before it counts as healthy
    min_healthy_time = "30s"
    # Time limit for an allocation to become healthy
    healthy_deadline = "5m"
    # Time limit for an allocation to be marked healthy before the deployment fails
    progress_deadline = "15m"
    # Roll back to the last stable version on failure
    auto_revert = true
    # Number of canary instances
    canary = 1
  }
}

  • NginxLoadBalancer


# nginx load balancer in front of "serviceweb": the upstream lists are
# rendered from Consul, and a request header selects the canary upstream.
job "NginxLoadBalancer" {
  datacenters = ["dc1"]

  group "groupNginxLoadBalancer" {
    count = 1

    network {
      # Port mapping: host port 80 -> container port 80
      port "http" {
        to     = 80
        static = 80
      }
      # port "https" {
      #   static = "443"
      #   to     = "443"
      # }
    }

    service {
      # Service registration provider: "nomad" or "consul". They pair with the
      # template functions nomadService and service respectively; the two must
      # match when using range, otherwise rendering reports:
      #   Missing: health.service(<service name>|passing)
      provider = "consul"
      name     = "serviceNginxLoadBalancer"
      port     = "http"
    }

    task "taskNginxLoadBalancer" {
      driver = "docker"

      resources {
        cpu    = 1500 # MHz
        memory = 100
      }

      config {
        # For a private GitLab registry image, configure auth and use an
        # access token as the password.
        # Do not use `latest`, otherwise every restart pulls the image again.
        image      = "nginx:v1"
        privileged = false
        # Container ports: only labels defined in the network block are allowed
        ports = ["http"]
        # Rendered templates in local/ become nginx's conf.d directory
        volumes = [
          "local:/etc/nginx/conf.d",
        ]
      }

      # The names queried below are service.name values; list running jobs
      # with: nomad job status
      # Template references:
      # https://github.com/hashicorp/consul-template#multiple-commands https://github.com/hashicorp/nomad/issues/8137
      template {
        destination   = "local/load-balancer.conf"
        change_signal = "SIGHUP"
        change_mode   = "signal"

        data = <<EOF

upstream backend {
{{ range service "release_tag.serviceweb" }}
  server {{ .Address }}:{{ .Port }};
{{ else }}
  server 127.0.0.1:65535; # force a 502
{{ end }}

}

upstream canary {
{{ range service "canary_tag.serviceweb" }}
  server {{ .Address }}:{{ .Port }};
{{ else }}
  server 127.0.0.1:65535; # force a 502
{{ end }}

}

server {
   listen 80;

   location / {
      # 灰度: X-Forwarded-For:canary
      if ($http_x_forwarded_for = "canary") {
        proxy_pass http://canary;
      }
      proxy_pass http://backend;
   }
}
EOF
      }
    }
  }
}



  • 配置 acl 授权验证

# 配置
acl {
  enabled = true
}

nomad status
# Error querying jobs: Unexpected response code: 403 (Permission denied)
nomad acl bootstrap
export NOMAD_TOKEN=e597d2fc-1e9f-3bde-be25-5220f3b0b274
nomad status
# ID                 Type     Priority  Status   Submit Date
# NginxLoadBalancer  service  50        running  2023-10-10T15:08:44+08:00
# web-service        service  50        running  2023-10-10T15:13:06+08:00

# web UI
nomad ui -authenticate -show-url
  • 远程连接

docker run --rm \
-e NOMAD_ADDR=http://www.leng2011.icu:4646 \
-e NOMAD_TOKEN=46605821-xxxxxx-4ff77aeaad2a \
--name=nomad docker.io/hashicorp/nomad:latest job status


# 在Nomad中使用 nomad job run 命令运行一个作业时,如果存在灰度发布,命令会等待直到灰度发布完成才会停止。
  • hcl 中声明变量, 从命令行传入

variable "imageurl" {
  description = "镜像地址"
  default     = "registry.jihulab.com/jcleng/imgsite:latest"
}

# hcl使用
image = "${var.imageurl}"

# Pass the value on the command line (quoted so the shell hands it over as a single argument)
nomad job plan -var "imageurl=${NEW_IMG}" dep.hcl
  • java

    # Runs a jar with Nomad's java driver.
    task "java" {
      # The java driver runs the task in a chroot; the java executable and its
      # environment are mapped into it, see:
      # https://developer.hashicorp.com/nomad/docs/drivers/java#chroot
      driver = "java"

      # Download the jar before the task starts and verify its checksum.
      artifact {
        source = "http://192.168.20.153:7777/my-project-0.0.1-SNAPSHOT.jar"
        options {
          checksum = "md5:875cb9b0899470b3c374e3666f3d6b93"
        }
      }

      config {
        jvm_options = ["-Xmx202m", "-Xms200m"]
        jar_path    = "local/my-project-0.0.1-SNAPSHOT.jar"
      }
    }
  • 缩放策略

  • traefikLoadBalancer 会自动监听文件变化, 无需配置change_mode/change_signal

      # Docker settings for the Traefik task: host networking, with the two
      # rendered config files mounted into /etc/traefik.
      config {
        image        = "traefik:v2.2"
        network_mode = "host"
        privileged   = true
        volumes = [
          "local/traefik-file-rule.yml:/etc/traefik/traefik-file-rule.yml",
          "local/traefik.yml:/etc/traefik/traefik.yml",
        ]
      }




      # Traefik dynamic configuration: a weighted service that splits traffic
      # 3:1 between release and canary instances of "serviceweb".
      # The else branches of the range loops emit a proper list item
      # ("- url: ...") so that `servers` is always a YAML list.
      template {
        data = <<EOF
tls:
  certificates:
    - certFile: /etc/nginx/crt/ssl.crt
      keyFile: /etc/nginx/crt/key.txt
http:
  routers:
    myMasterServices:
      # 访问/
      rule: "PathPrefix(`/`)"
      service: myWeb
      # 加入TLS
      tls: {}

  services:
    myWeb:
      weighted:
        services:
        - name: releaseWeb
          weight: 3
        {{if nomadService "canary.serviceweb"}}
        - name: canaryWeb
          weight: 1
        {{end}}

    {{if nomadService "release.serviceweb"}}
    releaseWeb:
      loadBalancer:
        servers:
        {{ range nomadService "release.serviceweb" }}
        - url: http://{{ .Address }}:{{ .Port }}
        {{ else }}
        - url: http://0.0.0.0:1024
        {{ end }}
    {{end}}
    {{if nomadService "canary.serviceweb"}}
    canaryWeb:
      loadBalancer:
        servers:
        {{ range nomadService "canary.serviceweb" }}
        - url: http://{{ .Address }}:{{ .Port }}
        {{ else }}
        - url: http://0.0.0.0:1024
        {{ end }}
    {{end}}


EOF

        destination = "local/traefik-file-rule.yml"
      }
      # Traefik static configuration: entry point on :80, insecure API and
      # dashboard, and the file provider pointing at the dynamic rules file.
      template {
        destination = "local/traefik.yml"

        data = <<EOF
entryPoints:
  web:
    address: :80

  # websecure:
  #   address: :443

api:
  insecure: true
  dashboard: true

providers:
  # 文件配置示例: https://doc.traefik.io/traefik/providers/file/#filename
  file:
    filename: /etc/traefik/traefik-file-rule.yml

EOF
      }
# 语法
https://pkg.go.dev/text/template

# if 语句
{{if nomadService "release.serviceweb"}}
{{end}}

# range loop; the else branch is rendered when the query returns no instances.
# Both branches emit a list item ("- url: ...") so the surrounding YAML list stays valid.
{{ range nomadService "canary.serviceweb" }}
- url: http://{{ .Address }}:{{ .Port }}
{{ else }}
- url: http://0.0.0.0:1024
{{ end }}