如果你每天都要手动添加几十台服务器到监控系统,或者需要批量修改上百个监控项的阈值,相信很快就会意识到自动化配置的重要性。Zabbix作为企业级监控解决方案,其API正是为这类场景而生的利器。
我刚开始接触Zabbix时,曾花了一整天时间在界面上点击操作,结果不仅效率低下还容易出错。直到发现API这个宝藏功能,才真正体会到什么叫做"解放双手"。通过API,我们可以用代码完成所有配置操作,比如:批量添加主机、批量修改监控项阈值、自动关联模板、配置触发器和自动发现规则等。
Zabbix API采用JSON-RPC 2.0协议,所有请求都是HTTP POST方式发送到/api_jsonrpc.php这个统一入口。它的设计非常规范,大多数对象都支持CRUD操作(create/read/update/delete),比如host、item、trigger这些核心组件。
使用API的第一步就是获取认证令牌(auth token)。这就像进入游乐场需要门票一样,后续所有操作都需要携带这个令牌。Zabbix提供了两种认证方式:
传统用户名密码认证:
import requests
import json

# API endpoint and common headers used by every request below.
zabbix_url = "http://your_zabbix_server/zabbix/api_jsonrpc.php"
headers = {"Content-Type": "application/json"}

# Authentication request body (JSON-RPC 2.0).
auth_data = {
    "jsonrpc": "2.0",
    "method": "user.login",
    "params": {
        # Default admin account. NOTE(review): Zabbix 6.0+ renamed this
        # field from "user" to "username" -- confirm the server version.
        "user": "Admin",
        "password": "your_password"
    },
    "id": 1  # request id, used to match the response
}

response = requests.post(zabbix_url, headers=headers, data=json.dumps(auth_data))
# "result" is absent when login fails -- auth_token will be None in that case.
auth_token = response.json().get("result")
print(f"获取到的认证令牌: {auth_token}")
API Token认证(Zabbix 5.4+版本):
# API token authentication (Zabbix 5.4+): the token travels in the HTTP
# Authorization header, so no user.login call and no "auth" body field needed.
api_token = "your_api_token_here"
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {api_token}"
}
注意:生产环境强烈建议使用专用API账号而非Admin账户,并严格控制权限。我曾经因为误操作导致所有监控项被删除,血泪教训!
获取所有主机列表是最常用的操作之一,下面这个例子会返回主机名及其IP地址:
def get_hosts(auth_token):
    """Return all enabled hosts as a list of dicts with hostid, host and interface IPs.

    Args:
        auth_token: session token obtained from user.login.

    Returns:
        The JSON-RPC "result" list, or [] when the key is missing (e.g. on error).
    """
    host_query = {
        "jsonrpc": "2.0",
        "method": "host.get",
        "params": {
            "output": ["hostid", "host"],
            "selectInterfaces": ["ip"],
            "filter": {"status": "0"}  # status 0 = monitored (enabled) hosts only
        },
        "auth": auth_token,
        "id": 2
    }
    response = requests.post(zabbix_url, headers=headers, data=json.dumps(host_query))
    return response.json().get("result", [])
hosts = get_hosts(auth_token)
for host in hosts:
    # NOTE(review): assumes every host has at least one interface --
    # interfaces[0] raises IndexError otherwise; verify against your data.
    print(f"主机名: {host['host']}, IP: {host['interfaces'][0]['ip']}")
当需要监控大量服务器时,手动添加简直是噩梦。这是我常用的批量创建方法:
def create_hosts(auth_token, host_list):
    """Create Zabbix agent hosts in bulk.

    Args:
        auth_token: session token from user.login.
        host_list: iterable of {"name": <hostname>, "ip": <address>} dicts.

    Returns:
        A list with the raw JSON-RPC response for each host (success or error).
    """
    results = []
    for host_info in host_list:
        # Build a fresh payload per host; the original mutated one shared
        # dict across iterations, which is fragile if responses are deferred.
        create_payload = {
            "jsonrpc": "2.0",
            "method": "host.create",
            "params": {
                "host": host_info["name"],
                "interfaces": [{
                    "type": 1,   # 1 = Zabbix agent interface
                    "main": 1,   # default interface
                    "useip": 1,  # connect by IP rather than DNS
                    "ip": host_info["ip"],
                    "dns": "",
                    "port": "10050"
                }],
                "groups": [{
                    # "Linux servers" group -- TODO confirm the id on the target server
                    "groupid": "2"
                }],
                "templates": [{
                    # "Template OS Linux" -- TODO confirm the id on the target server
                    "templateid": "10001"
                }]
            },
            "auth": auth_token,
            "id": 3
        }
        response = requests.post(zabbix_url, headers=headers, data=json.dumps(create_payload))
        results.append(response.json())
    return results
# Example host list
new_hosts = [
    {"name": "web-server-01", "ip": "192.168.1.10"},
    {"name": "db-server-01", "ip": "192.168.1.20"}
]
create_results = create_hosts(auth_token, new_hosts)
计划性维护时,可以批量设置主机进入维护模式:
def set_maintenance(auth_token, hostids, maintenance=True):
    """Bulk-enable or bulk-disable monitoring for the given hosts.

    NOTE(review): despite the name, this toggles the host *status* field
    (0 = monitored, 1 = unmonitored) via host.massupdate. It does NOT create
    a Zabbix maintenance-window object -- that is the maintenance.create API.

    Args:
        auth_token: session token from user.login.
        hostids: iterable of host id strings/ints.
        maintenance: True disables monitoring (status 1), False re-enables it.

    Returns:
        The raw JSON-RPC response dict.
    """
    maintenance_payload = {
        "jsonrpc": "2.0",
        "method": "host.massupdate",
        "params": {
            "hosts": [{"hostid": hid} for hid in hostids],
            "status": 1 if maintenance else 0
        },
        "auth": auth_token,
        "id": 4
    }
    response = requests.post(zabbix_url, headers=headers,
                             data=json.dumps(maintenance_payload))
    return response.json()
监控项(Item)是Zabbix的最小监控单元。创建CPU使用率监控的示例:
def create_cpu_item(auth_token, hostid):
    """Create a CPU idle-utilization item on the given host.

    Args:
        auth_token: session token from user.login.
        hostid: id of the host that will own the item.

    Returns:
        The raw JSON-RPC response dict.
    """
    item_data = {
        "jsonrpc": "2.0",
        "method": "item.create",
        "params": {
            "name": "CPU utilization",
            "key_": "system.cpu.util[,idle]",  # monitors *idle* CPU percentage
            "hostid": hostid,
            "type": 0,        # 0 = Zabbix agent
            "value_type": 0,  # 0 = numeric float
            "interfaceid": "1",  # TODO confirm: interface ids differ per host
            "delay": "1m",     # poll every minute
            "history": "7d",   # keep raw history for 7 days
            "trends": "365d",  # keep trend data for 1 year
            "units": "%",
            # NOTE(review): "applications" was removed in Zabbix 5.4+
            # (replaced by item tags) -- verify against the server version.
            "applications": ["5"]
        },
        "auth": auth_token,
        "id": 5
    }
    response = requests.post(zabbix_url, headers=headers,
                             data=json.dumps(item_data))
    return response.json()
当需要为多台主机配置相同监控项时,可以使用模板关联:
def link_template(auth_token, hostids, templateid):
    """Link one template to many hosts in a single host.massadd call.

    Args:
        auth_token: session token from user.login.
        hostids: iterable of host ids to link the template to.
        templateid: id of the template to attach.

    Returns:
        The raw JSON-RPC response dict.
    """
    template_link = {
        "jsonrpc": "2.0",
        "method": "host.massadd",
        "params": {
            "hosts": [{"hostid": hid} for hid in hostids],
            "templates": [{"templateid": templateid}]
        },
        "auth": auth_token,
        "id": 6
    }
    response = requests.post(zabbix_url, headers=headers,
                             data=json.dumps(template_link))
    return response.json()
对于动态环境,自动发现(LLD)是更好的选择。创建磁盘空间监控发现规则:
def create_disk_discovery_rule(auth_token, hostid):
    """Create a low-level discovery (LLD) rule for filesystems on a host.

    Only filesystems whose {#FSNAME} matches /, /boot or /home pass the filter.

    Args:
        auth_token: session token from user.login.
        hostid: id of the host that will own the discovery rule.

    Returns:
        The raw JSON-RPC response dict.
    """
    lld_rule = {
        "jsonrpc": "2.0",
        "method": "discoveryrule.create",
        "params": {
            "name": "Disk discovery",
            "key_": "vfs.fs.discovery",
            "hostid": hostid,
            "type": 0,           # 0 = Zabbix agent
            "interfaceid": "1",  # TODO confirm: interface ids differ per host
            "delay": "1h",       # rediscover hourly
            "lifetime": "30d",   # drop entities missing for 30 days
            "filter": {
                "evaltype": 0,   # 0 = and/or condition evaluation
                "conditions": [
                    {
                        "macro": "{#FSNAME}",
                        "value": "^(/|/boot|/home)$",
                        "operator": 8  # 8 = matches regular expression
                    }
                ]
            }
        },
        "auth": auth_token,
        "id": 7
    }
    response = requests.post(zabbix_url, headers=headers,
                             data=json.dumps(lld_rule))
    return response.json()
触发器(Trigger)用于定义异常条件。创建CPU过载触发器:
def create_cpu_trigger(auth_token, hostid):
    """Create a trigger that fires when CPU idle drops below 10%.

    NOTE(review): the ``hostid`` argument is never used -- trigger expressions
    reference hosts by *name*, and this one hard-codes "Linux server".
    The parameter is kept for caller compatibility; adapt the expression
    to the target host before using this in production.

    Args:
        auth_token: session token from user.login.
        hostid: unused (see note above).

    Returns:
        The raw JSON-RPC response dict.
    """
    trigger_data = {
        "jsonrpc": "2.0",
        "method": "trigger.create",
        "params": {
            "description": "High CPU load on {HOST.NAME}",
            # Fires when idle CPU < 10% (i.e. utilization > 90%).
            "expression": "last(/Linux server/system.cpu.util[,idle])<10",
            "priority": 3,  # 3 = average severity
            "comments": "Trigger when CPU idle less than 10% for 5 minutes",
            "tags": [
                {
                    "tag": "component",
                    "value": "cpu"
                }
            ]
        },
        "auth": auth_token,
        "id": 8
    }
    response = requests.post(zabbix_url, headers=headers,
                             data=json.dumps(trigger_data))
    return response.json()
对于复杂场景,可以设置触发器依赖关系:
def create_dependent_trigger(auth_token):
    """Create an HTTP-availability trigger that depends on another trigger.

    While the parent trigger (e.g. "host unreachable") is in PROBLEM state,
    this trigger is suppressed, avoiding duplicate alerts.

    Args:
        auth_token: session token from user.login.

    Returns:
        The raw JSON-RPC response dict.
    """
    trigger_data = {
        "jsonrpc": "2.0",
        "method": "trigger.create",
        "params": {
            "description": "Application unavailable (depends on host)",
            "expression": "last(/App Server/net.tcp.service[http,,80])=0",
            "dependencies": [
                {
                    # Hard-coded id of the "host unavailable" trigger --
                    # TODO confirm this id exists on the target server.
                    "triggerid": "1345"
                }
            ]
        },
        "auth": auth_token,
        "id": 9
    }
    response = requests.post(zabbix_url, headers=headers,
                             data=json.dumps(trigger_data))
    return response.json()
这是我实际项目中用过的Excel批量导入方案:
import pandas as pd

def import_hosts_from_excel(auth_token, excel_file):
    """Bulk-create hosts from an Excel sheet.

    The sheet must contain columns: hostname, ip, groupid.
    Hosts are sent in batches of 20 to avoid request timeouts.

    Args:
        auth_token: session token from user.login.
        excel_file: path (or file-like object) readable by pandas.read_excel.

    Returns:
        A flat list of the host ids created across all batches.
    """
    df = pd.read_excel(excel_file)

    # Build one host.create entry per spreadsheet row.
    hosts_to_create = []
    for _, row in df.iterrows():
        hosts_to_create.append({
            "host": row["hostname"],
            "interfaces": [{
                "type": 1,   # Zabbix agent interface
                "main": 1,
                "useip": 1,
                "ip": row["ip"],
                "port": "10050"
            }],
            "groups": [{"groupid": str(row["groupid"])}],
            "templates": [{"templateid": "10001"}],  # TODO confirm template id
            "inventory_mode": 0  # 0 = automatic inventory population
        })

    # host.create accepts an array of hosts; send 20 per request.
    batch_size = 20
    results = []
    for i in range(0, len(hosts_to_create), batch_size):
        batch = hosts_to_create[i:i + batch_size]
        batch_payload = {
            "jsonrpc": "2.0",
            "method": "host.create",
            "params": batch,
            "auth": auth_token,
            "id": 10
        }
        response = requests.post(zabbix_url, headers=headers,
                                 data=json.dumps(batch_payload))
        # On error "result" is absent -> {} -> no hostids appended.
        results.extend(response.json().get("result", {}).get("hostids", []))
    return results
建议对所有API调用进行统一封装处理:
def call_zabbix_api(method, params, auth_token=None, id=1):
    """Generic JSON-RPC wrapper: POST the request, surface errors, return "result".

    Args:
        method: API method name, e.g. "host.get".
        params: method parameters (dict or list).
        auth_token: session token; omitted for unauthenticated calls
            such as user.login or apiinfo.version.
        id: JSON-RPC request id (NOTE: shadows the builtin ``id``;
            kept for backward compatibility with existing callers).

    Returns:
        The "result" member of the response, or None if absent.

    Raises:
        Exception: on HTTP failure, timeout, or an API-level "error" response.
    """
    payload = {
        "jsonrpc": "2.0",
        "method": method,
        "params": params,
        "id": id
    }
    if auth_token:
        payload["auth"] = auth_token
    try:
        response = requests.post(zabbix_url, headers=headers,
                                 data=json.dumps(payload), timeout=30)
        response.raise_for_status()  # raise on 4xx/5xx HTTP status
        result = response.json()
        if "error" in result:
            error = result["error"]
            raise Exception(f"API错误: {error['message']} (代码: {error['code']})")
        return result.get("result")
    except requests.exceptions.RequestException as e:
        # Chain the original exception so the root cause stays visible.
        raise Exception(f"请求失败: {str(e)}") from e
我在实际项目中最深刻的教训是:永远要先在测试环境验证API脚本,特别是删除操作。有次误删了生产环境的所有触发器,不得不从备份恢复,那晚的加班记忆犹新。