forked from outlyerapp/plugins
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpacemaker.py
More file actions
154 lines (134 loc) · 5.66 KB
/
pacemaker.py
File metadata and controls
154 lines (134 loc) · 5.66 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#!/usr/bin/env python
import subprocess
import socket
import sys
import re
# path of the crm_mon executable, run below to get a one-shot cluster summary
bin_monitor = "/usr/sbin/crm_mon"
# path of the cl_status executable, run below for the heartbeat / hblink checks
bin_status = "/usr/bin/cl_status"
# findings collected so far: status code string -> comma-joined message text
summary = {}
def end(status, message):
    """Print a Nagios-style status line and terminate the process.

    Args:
        status: "OK", "WARNING" or "CRITICAL"; any other value is
            reported as "UNKNOWN".
        message: Text printed after the "<STATUS>: " prefix.

    Raises:
        SystemExit: Always. The exit code is 0 for OK, 1 for WARNING,
            2 for CRITICAL and 3 for everything else.
    """
    exit_codes = {"OK": 0, "WARNING": 1, "CRITICAL": 2}
    if status in exit_codes:
        label, code = status, exit_codes[status]
    else:
        # unrecognised statuses are folded into UNKNOWN
        label, code = "UNKNOWN", 3
    # single-argument call form: valid both as the Python 2 print statement
    # and as the Python 3 print function
    print("%s: %s" % (label, message))
    sys.exit(code)
def status_info(code, info, nagios_output):
    """Record one finding and, on request, emit the final plugin result.

    Findings are accumulated in the module-level ``summary`` dict, one
    comma-joined message string per status code.

    Args:
        code: Status bucket for the finding ("OK", "WARNING", "CRITICAL"
            or "UNKNOWN"). An empty code records nothing, which lets the
            caller use ``status_info("", "", True)`` purely to flush.
        info: Message text appended to the bucket.
        nagios_output: When true, report the accumulated summary through
            ``end()``, which prints the result and exits the process.

    Reporting precedence is CRITICAL (with any WARNING text appended),
    then WARNING, then OK, then UNKNOWN. If nothing was collected at all
    the result is UNKNOWN rather than a silent exit with status 0.
    """
    if code:
        if code in summary:
            summary[code] += ", %s" % info
        else:
            summary[code] = info

    if not nagios_output:
        return

    if "CRITICAL" in summary:
        # warnings are still worth showing next to the critical findings
        if "WARNING" in summary:
            summary["CRITICAL"] += ", %s" % summary["WARNING"]
        end("CRITICAL", summary["CRITICAL"])
    elif "WARNING" in summary:
        end("WARNING", summary["WARNING"])
    elif "OK" in summary:
        end("OK", summary["OK"])
    elif "UNKNOWN" in summary:
        end("UNKNOWN", summary["UNKNOWN"])
    else:
        # no check produced a recognisable result; do not pretend all is well
        end("UNKNOWN", "No status information collected")
# ---------------------------------------------------------------------------
# Cluster state: parse the one-shot crm_mon report line by line and record
# every problem found through status_info(). Nothing is printed here; the
# final result is emitted by the flush call at the end of the script.
# ---------------------------------------------------------------------------

# patterns are compiled once instead of on every output line
re_offline = re.compile(r'OFFLINE:\s*\[\s*(\S.*?)\s*\]')
re_ms_stopped = re.compile(r'\s*Stopped\:\s*\[(.*)\]')
# resource ids may contain '-' or '.', so the name is matched with \S+
# (with \w+ such resources were silently skipped)
re_stopped = re.compile(r'\s*(\S+)\s+\(\S+\)\:\s+Stopped')
re_failed = re.compile(r'\s*(\S+)\s+\(\S+\)\:.+FAILED')
re_failcount = re.compile(r'\s*(.+)\:\s+migration-threshold=(\d+)\s+fail-count=(\d+)')
re_unmanaged = re.compile(r'\s*(\S+)\s+\(.+\)\:\s+\w+\s+\S+\s+\(unmanaged\)')

# number of critical fail-counts (strictly more than this is CRITICAL)
fccrit = 10
# number of warning fail-counts (strictly more than this is WARNING)
fcwarn = 5

try:
    # summary of cluster's current state
    # (-1: print once and exit, -r: show inactive resources, -f: show fail counts)
    cmd_monitor = subprocess.Popen(
        [bin_monitor, "-1rf"],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        # text mode, so lines are str rather than bytes on Python 3 as well
        universal_newlines=True
    )
except OSError as errmsg:
    end("UNKNOWN", "%s: %s" % (bin_monitor, errmsg))
else:
    for line in cmd_monitor.stdout:
        # check cluster connected; end() exits immediately
        if "Connection to cluster failed:" in line:
            end("CRITICAL", "Connection to cluster failed")

        # check for Designated Controller
        if "Current DC:" in line:
            # check for Quorum
            if "partition with quorum" in line:
                status_info("OK", "All resources configured", False)
            else:
                status_info("CRITICAL", "No quorum", False)

        # check offline nodes
        m = re_offline.match(line)
        if m:
            nodes = m.group(1)
            # split() without an argument tolerates repeated blanks
            count = len(nodes.split())
            status_info("CRITICAL", "%s node(s) OFFLINE: %s" % (count, nodes), False)

        # check Master/Slave Stopped
        m = re_ms_stopped.match(line)
        if m:
            drbd = m.group(1)
            status_info("CRITICAL", "%s Stopped" % drbd, False)

        # check Resources Stopped
        m = re_stopped.match(line)
        if m:
            resources = m.group(1)
            status_info("CRITICAL", "%s Stopped" % resources, False)

        # check Resources FAILED
        m = re_failed.match(line)
        if m:
            resources = m.group(1)
            status_info("CRITICAL", "%s FAILED" % resources, False)

        # check fail-count
        m = re_failcount.match(line)
        if m:
            resource, threshold, fcounts = m.groups()
            if int(fcounts) > fccrit:
                status_info("CRITICAL", "%s %s fail-count(s) of %s detected" % (resource, fcounts, threshold), False)
            elif int(fcounts) > fcwarn:
                status_info("WARNING", "%s %s fail-count(s) of %s detected" % (resource, fcounts, threshold), False)
            else:
                # fail-count below both thresholds: nothing to report for this line
                continue

        # check Unmanaged
        m = re_unmanaged.match(line)
        if m:
            unmanaged = m.group(1)
            status_info("WARNING", "%s unmanaged" % unmanaged, False)
# ---------------------------------------------------------------------------
# Heartbeat: make sure heartbeat runs locally, then check the state of every
# heartbeat link to every other node reported by cl_status.
# ---------------------------------------------------------------------------
try:
    cmd_hbstatus = subprocess.Popen(
        [bin_status, "hbstatus"],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        # text mode, so lines are str rather than bytes on Python 3 as well
        universal_newlines=True
    )
except OSError as errmsg:
    end("UNKNOWN", "%s: %s" % (bin_status, errmsg))
else:
    # check to see if heartbeat is running; end() exits immediately
    for line in cmd_hbstatus.stdout:
        if "stopped" in line:
            end("CRITICAL", "Heartbeat is stopped on this machine")

    # (node, link) pairs already checked, so duplicated links are reported once
    hblinks = set()
    # looked up once; it does not change between nodes
    local_hostname = socket.gethostname()

    # find only 'normal' type nodes
    cmd_listnodes = subprocess.Popen(
        [bin_status, "listnodes", "-n"],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True
    )
    for node in cmd_listnodes.stdout:
        # strip newlines and blanks
        node = node.strip()
        if not node:
            continue
        # list hostnames except $(hostname)
        # NOTE(review): this is a substring test, so on host "node1" a peer
        # called "node10" is skipped as well - confirm whether that is intended
        if local_hostname in node:
            continue

        # find network interfaces used as heartbeat links
        cmd_listhblinks = subprocess.Popen(
            [bin_status, "listhblinks", node],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            universal_newlines=True
        )
        for link in cmd_listhblinks.stdout:
            # strip newlines and blanks
            link = link.strip()
            if not link:
                continue
            # remove duplicated links
            node_link = (node, link)
            if node_link in hblinks:
                continue
            hblinks.add(node_link)

            # check status of a heartbeat link
            cmd_hblinkstatus = subprocess.Popen(
                [bin_status, "hblinkstatus", node, link],
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                universal_newlines=True
            )
            for line in cmd_hblinkstatus.stdout:
                if "dead" in line:
                    status_info("CRITICAL", "dead hblink %s to %s" % (link, node), False)
                elif "up" in line:
                    status_info("OK", "hblink %s to %s is up" % (link, node), False)
                else:
                    status_info("UNKNOWN", "Something is wrong with hblinks...", False)

# flush: report everything collected above and exit with the matching code
status_info("", "", True)