import logging
import os

try:
    import psutil
except ImportError:  # optional dependency; CPU metrics need it
    psutil = None

import math
import re

import agent_util

DOCKER_SOCKET = '/var/run/docker.sock'
NANOSECONDS = 1000000000
CLOCK_TICKS = 100


class DockerPlugin(agent_util.Plugin):
    """Agent plugin that reports Docker container metrics.

    Metrics are gathered from the cgroup filesystem (v1 and v2 layouts),
    /proc/<pid>/net/dev, and the Docker daemon's UNIX-socket HTTP API.
    """

    textkey = "docker"
    label = "Docker"

    #########################################################
    #                       Metadata                        #
    #########################################################

    @classmethod
    def is_cgroups_v2(self):
        # cgroup.controllers exists at the cgroup root only under cgroups v2.
        return os.path.isfile('/sys/fs/cgroup/cgroup.controllers')

    @classmethod
    def get_metadata(self, config):
        """Return host-level (container count) metric definitions.

        Returns an empty dict when the docker binary is not installed.
        """
        if not agent_util.which("docker"):
            self.log.info("docker not present")
            return {}
        status = agent_util.SUPPORTED
        return {
            "containers.num_running": {
                "label": "Number of containers running",
                "options": None,
                "status": status,
                "error_message": "",
                "unit": "count",
            },
            "containers.num_running_img": {
                "label": "Number of containers running image",
                "options": None,
                "option_string": True,
                "status": status,
                "error_message": "",
                "unit": "count",
            },
            "containers.num_running_name": {
                "label": "Number of containers running by name",
                "options": None,
                "option_string": True,
                "status": status,
                "error_message": "",
                "unit": "count",
            }
        }

    @classmethod
    def get_metadata_docker(self, container, config):
        """Return the per-container metric definitions for *container*."""
        metadata = {}
        metadata.update(self.get_cpu_metadata(container, config))
        metadata.update(self.get_memory_metadata(container, config))
        metadata.update(self.get_network_metadata(container, config))
        metadata.update(self.get_io_metadata(container, config))
        metadata.update({
            # Container is running
            "status.running": {
                "label": "Container is Running",
                "options": None,
                "status": agent_util.SUPPORTED,
                "error_message": "",
                "unit": "boolean",
            },
            # Disk metrics are always available (served by the daemon API)
            "disk.size_rw": {
                "label": "Size RW",
                "options": None,
                "status": agent_util.SUPPORTED,
                "error_message": "",
                "unit": "bytes",
            },
            "disk.size_root_fs": {
                "label": "Size Root FS",
                "options": None,
                "status": agent_util.SUPPORTED,
                "error_message": "",
                "unit": "bytes",
            }
        })
        return metadata

    @classmethod
    def get_cpu_metadata(self, container, config):
        """Probe the cgroup CPU accounting files and build CPU metadata."""
        container_id = container["Id"]
        cpu_metadata = {
            "cpu.usage_percentage": {
                "label": "CPU Usage Percentage",
                "options": None,
                "status": agent_util.SUPPORTED,
                "error_message": '',
                "unit": "%",
            },
            "cpu.user_usage": {
                "label": "CPU Percent Used [User]",
                "options": None,
                "status": agent_util.SUPPORTED,
                "error_message": '',
                "unit": "%",
            },
            "cpu.sys_usage": {
                "label": "CPU Percent Used [System]",
                "options": None,
                "status": agent_util.SUPPORTED,
                "error_message": '',
                "unit": "%",
            }
        }
        if self.is_cgroups_v2():
            stat_file = f'/sys/fs/cgroup/system.slice/docker-{container_id}.scope/cpu.stat'
            stats = DockerPlugin.read_stats_from_file(stat_file)
            # Map each textkey to its field inside cpu.stat.
            textkey_map = {
                "cpu.usage_percentage": "usage_usec",
                "cpu.user_usage": "user_usec",
                "cpu.sys_usage": "system_usec"
            }
            for key in cpu_metadata.keys():
                if stats.get(textkey_map[key], None) is None:
                    cpu_metadata[key]['error_message'] = 'Cannot access docker stats file'
                    cpu_metadata[key]['status'] = agent_util.UNSUPPORTED
        else:
            # Map each textkey to its cgroups-v1 cpuacct interface file.
            textkey_map = {
                "cpu.usage_percentage": 'cpuacct.usage',
                "cpu.user_usage": 'cpuacct.usage_user',
                "cpu.sys_usage": 'cpuacct.usage_sys'
            }
            for textkey in cpu_metadata.keys():
                file_name = textkey_map.get(textkey, None)
                if file_name is None:
                    self.log.warning(f'Docker CPU metadata: missing map key {textkey}')
                    continue
                fpath = f"/sys/fs/cgroup/cpuacct/docker/{container_id}/{file_name}"
                metric = DockerPlugin.read_single_stat_file(fpath)
                if metric is None:
                    cpu_metadata[textkey]['status'] = agent_util.UNSUPPORTED
                    # Fixed key: was 'error_msg', which the metadata schema
                    # never reads ('error_message' everywhere else).
                    cpu_metadata[textkey]['error_message'] = "Can't access '{}'".format(file_name)
        return cpu_metadata

    @classmethod
    def read_single_stat_file(self, file_name):
        """Read a single numeric value from *file_name*; None on failure."""
        try:
            with open(file_name, "r") as f:
                metric = f.read()
            return float(metric)
        except Exception:
            self.log.exception('Read stat file {} failure'.format(file_name))
            return None

    @classmethod
    def read_stats_from_file(self, file_name):
        """Parse a "<name> <number>" per line stats file into a dict.

        Returns {} on any read/parse failure.
        """
        try:
            with open(file_name, "r") as f:
                output = f.readlines()
            stats = {}
            for line in output:
                stat_type, num = line.split(" ")
                stats[stat_type] = float(num)
            return stats
        except Exception:
            self.log.exception('Read stats from {} failure'.format(file_name))
            return {}

    @classmethod
    def get_memory_metadata(self, container, config):
        """Probe the cgroup memory files and build memory metadata."""
        container_id = container["Id"]
        if self.is_cgroups_v2():
            # v2 exposes fewer fields: only usage and mapped-file.
            memory_metadata = {
                "memory.usage": {
                    "label": "Memory Used",
                    "options": None,
                    "status": agent_util.SUPPORTED,
                    "error_message": '',
                    "unit": "bytes",
                },
                "memory.mapped_file": {
                    "label": "Memory Mapped File",
                    "options": None,
                    "status": agent_util.SUPPORTED,
                    "error_message": "",
                    "unit": "bytes",
                }
            }
            stats_file = f'/sys/fs/cgroup/system.slice/docker-{container_id}.scope/memory.stat'
            metrics = DockerPlugin.read_stats_from_file(stats_file)
            if metrics.get('file_mapped', None) is None:
                memory_metadata['memory.mapped_file']['error_message'] = f'Cannot read {stats_file}'
                memory_metadata['memory.mapped_file']['status'] = agent_util.UNSUPPORTED
            memory_current = f'/sys/fs/cgroup/system.slice/docker-{container_id}.scope/memory.current'
            metric = DockerPlugin.read_single_stat_file(memory_current)
            if metric is None:
                memory_metadata['memory.usage']['error_message'] = f'Cannot read {stats_file}'
                memory_metadata['memory.usage']['status'] = agent_util.UNSUPPORTED
            return memory_metadata

        memory_metadata = {
            "memory.usage": {
                "label": "Memory Used",
                "options": None,
                "status": agent_util.SUPPORTED,
                "error_message": '',
                "unit": "bytes",
            },
            "memory.cache": {
                "label": "Memory Cached",
                "options": None,
                "status": agent_util.SUPPORTED,
                "error_message": "",
                "unit": "bytes",
            },
            "memory.rss": {
                "label": "Memory RSS",
                "options": None,
                "status": agent_util.SUPPORTED,
                "error_message": "",
                "unit": "bytes",
            },
            "memory.mapped_file": {
                "label": "Memory Mapped File",
                "options": None,
                "status": agent_util.SUPPORTED,
                "error_message": "",
                "unit": "bytes",
            },
            "memory.swap": {
                "label": "Swap Used",
                "options": None,
                "status": agent_util.SUPPORTED,
                "error_message": "",
                "unit": "bytes",
            }
        }
        total_metric = self.read_single_stat_file(
            "/sys/fs/cgroup/memory/docker/%s/memory.usage_in_bytes" % container_id)
        if total_metric is None:
            memory_metadata['memory.usage']['status'] = agent_util.UNSUPPORTED
            memory_metadata['memory.usage']['error_message'] = "Can't access 'memory.usage_in_bytes'"
        stats = self.read_stats_from_file(
            "/sys/fs/cgroup/memory/docker/%s/memory.stat" % container_id)
        for key in memory_metadata.keys():
            if 'memory.usage' == key:
                continue
            # "memory.cache" -> "cache", the field name inside memory.stat.
            metric_key = key.split('.')[1]
            if metric_key not in stats:
                memory_metadata[key]['status'] = agent_util.UNSUPPORTED
                # Fixed key: was 'error_msg' (schema uses 'error_message').
                memory_metadata[key]['error_message'] = 'Unable to read stats file'
        return memory_metadata

    @classmethod
    def get_network_metadata(self, container, config):
        """Probe /proc/<pid>/net/dev for eth0 and build network metadata."""
        container_id = container["Id"]
        status = agent_util.UNSUPPORTED
        msg = ""
        # Get the PID
        try:
            conn = agent_util.UnixHTTPConnection(DOCKER_SOCKET)
            conn.request('GET', '/containers/%s/json' % container_id,
                         headers={'Host': 'localhost'})
            r = conn.getresponse().read()
            j = agent_util.json_loads(r)
            container_pid = j["State"]["Pid"]
        except Exception:
            container_pid = None
            msg = "Can't get container's PID"
        if container_pid:
            try:
                with open("/proc/%s/net/dev" % container_pid, "r") as f:
                    output = f.readlines()
                eth0 = False
                for line in output:
                    if line.lstrip().startswith("eth0:"):
                        eth0 = True
                        split = line.split()
                        # 17 columns = interface name + 16 counters.
                        if len(split) == 17:
                            status = agent_util.SUPPORTED
                        else:
                            msg = "Unexpected # of columns in /proc//net/dev"
                        break
                if not eth0:
                    msg = "Can't find eth0 device on container"
            except Exception:
                msg = "Can't access /proc//net/dev"
        return {
            "net.rx_bytes": {
                "label": "Bytes In Per Second",
                "options": None,
                "status": status,
                "error_message": msg,
                "unit": "bytes/sec",
            },
            "net.rx_packets": {
                "label": "Packets In Per Second",
                "options": None,
                "status": status,
                "error_message": msg,
                "unit": "packets/sec",
            },
            "net.rx_errs": {
                "label": "RX Errors Per Second",
                "options": None,
                "status": status,
                "error_message": msg,
                "unit": "errors/sec",
            },
            "net.tx_bytes": {
                "label": "Bytes Out Per Second",
                "options": None,
                "status": status,
                "error_message": msg,
                "unit": "bytes/sec",
            },
            "net.tx_packets": {
                "label": "Packets Out Per Second",
                "options": None,
                "status": status,
                "error_message": msg,
                "unit": "packets/sec",
            },
            "net.tx_errs": {
                "label": "TX Errors Per Second",
                "options": None,
                "status": status,
                "error_message": msg,
                "unit": "errors/sec",
            },
        }

    @classmethod
    def get_io_metadata(self, container, config):
        """Probe the blkio cgroup files and build block-I/O metadata."""
        container_id = container["Id"]
        if self.is_cgroups_v2():
            # Under v2 the values come from `docker stats`, always available.
            return {
                "io.bytes_in": {
                    "label": "Bytes In",
                    "options": None,
                    "status": agent_util.SUPPORTED,
                    "error_message": '',
                    "unit": "bytes",
                },
                "io.bytes_out": {
                    "label": "Bytes Out",
                    "options": None,
                    "status": agent_util.SUPPORTED,
                    "error_message": '',
                    "unit": "bytes",
                }
            }
        service_bytes = agent_util.UNSUPPORTED
        service_bytes_message = ""
        try:
            fpath = "/sys/fs/cgroup/blkio/docker/%s/blkio.throttle.io_service_bytes" % container_id
            with open(fpath, "r") as f:
                f.read()
            service_bytes = agent_util.SUPPORTED
        except Exception:
            service_bytes_message = "Can't access 'blkio.throttle.io_service_bytes'"
        operations = agent_util.UNSUPPORTED
        operations_message = ""
        try:
            fpath = "/sys/fs/cgroup/blkio/docker/%s/blkio.throttle.io_serviced" % container_id
            with open(fpath, "r") as f:
                f.read()
            operations = agent_util.SUPPORTED
        except Exception:
            operations_message = "Can't access 'blkio.throttle.io_serviced'"
        return {
            "io.bytes_written": {
                "label": "Bytes Written Per Second",
                "options": None,
                "status": service_bytes,
                "error_message": service_bytes_message,
                "unit": "bytes/s",
            },
            "io.bytes_read": {
                "label": "Bytes Read Per Second",
                "options": None,
                "status": service_bytes,
                "error_message": service_bytes_message,
                "unit": "bytes/s",
            },
            "io.write_ops": {
                "label": "Writes Per Second",
                "options": None,
                "status": operations,
                "error_message": operations_message,
                "unit": "w/s",
            },
            "io.read_ops": {
                "label": "Reads Per Second",
                "options": None,
                "status": operations,
                "error_message": operations_message,
                "unit": "r/s",
            },
        }

    #########################################################
    #                        Checks                         #
    #########################################################

    def check(self, textkey, data, config):
        """Host-level check entry point; dispatches container counters."""
        if textkey.startswith("containers."):
            return self.get_containers_metric(textkey, data, config)

    def get_containers_metric(self, textkey, data, config):
        """Count running containers, optionally filtered by image or name.

        *data* is a glob-ish pattern ('*' wildcards) matched with re.search.
        Returns None when the daemon cannot be queried.
        """
        def get_running_containers():
            try:
                conn = agent_util.UnixHTTPConnection(DOCKER_SOCKET)
                conn.request('GET', '/containers/json', headers={'Host': 'localhost'})
                r = conn.getresponse().read()
                j = agent_util.json_loads(r)
                return j
            except Exception:
                self.log.exception('Get running containers error')
                return None

        running = get_running_containers()
        # Daemon unreachable: report no data rather than crash on None.
        if running is None:
            return None

        if textkey == "containers.num_running":
            return len(running)
        elif textkey == "containers.num_running_img":
            search = data or "*"
            search = search.replace('*', '.*')
            search = search.replace('""', '.*')
            count = 0
            for container in running:
                image = container.get("Image", "")
                if re.search(search, image):
                    count += 1
            return count
        elif textkey == "containers.num_running_name":
            search = data or "*"
            search = search.replace('*', '.*')
            search = search.replace('""', '.*')
            count = 0
            for container in running:
                names = container.get("Names", [])
                for name in names:
                    if re.search(search, name):
                        count += 1
            return count

    def check_docker(self, container, textkey, data, config):
        """Per-container check entry point; dispatches on textkey prefix."""
        if textkey.startswith("cpu."):
            return self.get_cpu_metric(container, textkey, data, config)
        elif textkey.startswith("memory."):
            return self.get_memory_metric(container, textkey, data, config)
        elif textkey.startswith("net."):
            return self.get_network_metric(container, textkey, data, config)
        elif textkey.startswith("disk."):
            return self.get_disk_metric(container, textkey, data, config)
        elif textkey.startswith("io."):
            return self.get_io_metric(container, textkey, data, config)
        elif textkey.startswith("status."):
            return self.get_status_metric(container, textkey, data, config)
        return None

    def _read_cpu_metric(self, textkey, container_id):
        """Read the raw cumulative CPU counter for *textkey*; None on error."""
        if self.is_cgroups_v2():
            stat_file = f'/sys/fs/cgroup/system.slice/docker-{container_id}.scope/cpu.stat'
            stats = DockerPlugin.read_stats_from_file(stat_file)
            if "cpu.usage_percentage" == textkey:
                return stats.get("usage_usec", None)
            elif "cpu.user_usage" == textkey:
                return stats.get("user_usec", None)
            elif "cpu.sys_usage" == textkey:
                return stats.get("system_usec", None)
            self.log.warning(f'Unrecognized textkey {textkey} in _read_cpu_metric')
            return None
        stat_file = None
        base_path = f'/sys/fs/cgroup/cpuacct/docker/{container_id}'
        if "cpu.usage_percentage" == textkey:
            stat_file = os.path.join(base_path, 'cpuacct.usage')
        elif "cpu.user_usage" == textkey:
            stat_file = os.path.join(base_path, 'cpuacct.usage_user')
        elif "cpu.sys_usage" == textkey:
            stat_file = os.path.join(base_path, 'cpuacct.usage_sys')
        if stat_file is None:
            self.log.error(f'Unrecognized textkey {textkey} in _read_cpu_metric')
            return None
        return DockerPlugin.read_single_stat_file(stat_file)

    def get_cpu_metric(self, container, textkey, data, config):
        """Return the container's CPU percent since the previous poll.

        All three CPU textkeys share identical delta logic; only the cache
        namespace ('docker.' + textkey) and the counter file differ, so the
        three previously-duplicated branches are consolidated here.
        Returns None on the first poll (no baseline yet) or on read failure.
        """
        container_id = container["Id"]

        def get_total_system():
            # Total system CPU time in the same units as the cgroup counter:
            # nanoseconds for cgroups v1, microseconds for v2.
            cpu_times = psutil.cpu_times()
            total_system = 0
            for key in ['user', 'nice', 'system', 'idle', 'iowait', 'irq', 'softirq']:
                total_system += getattr(cpu_times, key) * 100
            total_system = (total_system * NANOSECONDS) / CLOCK_TICKS
            if self.is_cgroups_v2():
                total_system = total_system / 1000
            return total_system

        if textkey not in ("cpu.usage_percentage", "cpu.user_usage", "cpu.sys_usage"):
            return None

        cache_ns = 'docker.' + textkey
        last_system = self.get_cache_results(cache_ns, 'total_system')
        last_system = last_system[0][1] if last_system else None
        last_container = self.get_cache_results(cache_ns, container_id)
        last_container = last_container[0][1] if last_container else None

        total_system = get_total_system()
        self.cache_result(cache_ns, 'total_system', total_system, replace=True)
        total_container = self._read_cpu_metric(textkey, container_id)
        if total_container is None:
            return None
        self.cache_result(cache_ns, container_id, total_container, replace=True)
        if last_system is None or last_container is None:
            return None
        container_delta = total_container - last_container
        system_delta = total_system - last_system
        if not system_delta:
            # Polled again before any system time elapsed; avoid div-by-zero.
            return None
        num_cpus = psutil.cpu_count()
        return (float(container_delta) / system_delta) * num_cpus * 100.

    def get_memory_metric(self, container, textkey, data, config):
        """Return a memory stat (bytes) for the container, or None."""
        container_id = container["Id"]

        def get_total_bytes():
            fname = "/sys/fs/cgroup/memory/docker/%s/memory.usage_in_bytes" % container_id
            if self.is_cgroups_v2():
                fname = f'/sys/fs/cgroup/system.slice/docker-{container_id}.scope/memory.current'
            return DockerPlugin.read_single_stat_file(fname)

        def get_memory_stats():
            fname = "/sys/fs/cgroup/memory/docker/%s/memory.stat" % container_id
            if self.is_cgroups_v2():
                fname = f'/sys/fs/cgroup/system.slice/docker-{container_id}.scope/memory.stat'
            return DockerPlugin.read_stats_from_file(fname)

        if textkey == "memory.usage":
            try:
                total_bytes = get_total_bytes()
                if self.is_cgroups_v2():
                    return total_bytes
                # v1 usage_in_bytes includes the page cache; subtract it.
                memory_stats = get_memory_stats()
                return total_bytes - memory_stats["cache"]
            except Exception:
                self.log.exception('Docker get memory.usage error')
                return None
        elif textkey in ["memory.cache", "memory.rss", "memory.mapped_file", "memory.swap"]:
            try:
                memory_stats = get_memory_stats()
                if not self.is_cgroups_v2():
                    stat_type = textkey.split(".")[1]
                    return memory_stats[stat_type]
                if 'memory.mapped_file' == textkey:
                    # v2 renamed the field to 'file_mapped'.
                    return memory_stats['file_mapped']
                raise Exception(f'Unrecognized textkey {textkey}')
            except Exception:
                self.log.exception('Docker get {} error'.format(textkey))
                return None

    def get_container_pid(self, container):
        """Look up the container's init PID via the daemon API; None on error."""
        conn = None
        try:
            container_id = container['Id']
            conn = agent_util.UnixHTTPConnection(DOCKER_SOCKET)
            conn.request(
                'GET',
                '/containers/%s/json' % container_id,
                headers={'Host': 'localhost'}
            )
            r = conn.getresponse().read()
            j = agent_util.json_loads(r)
            return j["State"]["Pid"]
        except Exception:
            self.log.exception('Get container pid error')
            return None
        finally:
            # Best-effort close; conn may be None or already broken.
            if conn is not None:
                try:
                    conn.close()
                except Exception:
                    pass

    def get_network_metric(self, container, textkey, data, config):
        """Return an eth0 rate (per second) computed against the cached poll."""
        container_id = container["Id"]
        # Find the container's PID
        container_pid = self.get_container_pid(container)
        if container_pid is None:
            return None

        def get_proc_stats(pid):
            proc_file = "/proc/%s/net/dev" % pid
            with open(proc_file, "r") as f:
                content = f.readlines()
            eth0_line = None
            for line in content:
                if line.lstrip().startswith("eth0:"):
                    eth0_line = line
                    break
            if not eth0_line:
                raise Exception("No line for eth0 in {}".format(proc_file))
            eth0_line = eth0_line.split()
            # Positional column names; empty strings mark ignored columns.
            keys = ["", "rx_bytes", "rx_packets", "rx_errs", "", "", "", "", "",
                    "", "tx_bytes", "tx_packets", "tx_errs", "", "", "", "", ""]
            stats = {}
            for col, text in enumerate(eth0_line):
                key = keys[col]
                if key:
                    stats[key] = int(text)
            return stats

        if textkey in ["net.rx_bytes", "net.rx_packets", "net.rx_errs",
                       "net.tx_bytes", "net.tx_packets", "net.tx_errs"]:
            key = textkey.split(".")[1]
            # Key the cache per container: a bare stat name would make
            # multiple containers overwrite each other's baselines.
            cache_key = '%s.%s' % (container_id, key)
            last = self.get_cache_results('docker.net', cache_key)
            if last:
                last_val = last[0][1]
                seconds = last[0][0]
            else:
                last_val = None
                seconds = None
            try:
                stats = get_proc_stats(container_pid)
                stat = stats[key]
            except Exception:
                # Fixed: previous code referenced an undefined name here.
                self.log.exception("Error accessing /proc/%s/net/dev", container_pid)
                return None
            self.cache_result('docker.net', cache_key, stat, replace=True)
            if last_val is None or not seconds:
                return None
            return (stat - last_val) / seconds

    def get_disk_metric(self, container, textkey, data, config):
        """Return the container's RW-layer or rootfs size from the API."""
        container_id = container["Id"]
        try:
            conn = agent_util.UnixHTTPConnection(DOCKER_SOCKET)
            conn.request('GET', '/containers/%s/json?size=true' % container_id,
                         headers={'Host': 'localhost'})
            r = conn.getresponse().read()
            j = agent_util.json_loads(r)
        except Exception:
            self.log.exception('Docker get disk metric error')
            return None
        if textkey == "disk.size_rw":
            return j.get("SizeRw", None)
        elif textkey == "disk.size_root_fs":
            return j.get("SizeRootFs", None)

    def get_metric_as_bytes(self, metric_string):
        """Convert a `docker stats` size string (e.g. '1.2MiB') to bytes.

        IEC suffixes (KiB/MiB/GiB/TiB) use powers of 1024; SI suffixes
        (kB/MB/GB/TB) use powers of 1000. Returns None on parse failure.
        """
        try:
            index = 0
            while index < len(metric_string):
                if metric_string[index].isdigit() or '.' == metric_string[index]:
                    index += 1
                    continue
                break
            metric_value = float(metric_string[0:index])
            units = metric_string[index:].lower()
            self.log.info('metric_string {} -> {} {}'.format(metric_string, metric_value, units))
            conversion = 1
            if 'kib' == units:
                # Fixed: KiB is 1024 bytes (was incorrectly 1000).
                conversion = 1024
            elif 'mib' == units:
                conversion = math.pow(1024, 2)
            elif 'gib' == units:
                conversion = math.pow(1024, 3)
            elif 'tib' == units:
                conversion = math.pow(1024, 4)
            elif 'kb' == units:
                conversion = 1000
            elif 'mb' == units:
                conversion = math.pow(1000, 2)
            elif 'gb' == units:
                conversion = math.pow(1000, 3)
            elif 'tb' == units:
                conversion = math.pow(1000, 4)
            return metric_value * conversion
        except Exception:
            self.log.exception('get_metric_as_bytes error')
            return None

    def _get_docker_block_stats(self, container, textkey):
        """
        Read the I/O metrics from docker stats, because the /proc io file
        is read-only to root.
        """
        import json

        def parse_multi_metric_entry(metric_line):
            # e.g. "1.2MB / 3.4MB" -> [bytes_in, bytes_out]
            items = metric_line.split('/')
            metrics = [item.strip() for item in items]
            rv = []
            for metric in metrics:
                rv.append(self.get_metric_as_bytes(metric))
            return rv

        try:
            container_id = container['Id']
            rc, output = agent_util.execute_command(
                'docker stats {} --no-stream --format json'.format(container_id),
                cache_timeout=agent_util.DEFAULT_CACHE_TIMEOUT
            )
            if 0 != rc:
                self.log.error('Docker stats failure: {}'.format(rc))
                return None
            data = agent_util.json_loads(output)
            self.log.debug('Docker Stats result: {}'.format(json.dumps(data, indent=1)))
            mld = parse_multi_metric_entry(data['BlockIO'])
            if 2 != len(mld):
                self.log.error('get_docker_block_stats error: Unexpected metric count')
                self.log.info(output)
                return None
            if "io.bytes_out" == textkey:
                return mld[1]
            elif "io.bytes_in" == textkey:
                return mld[0]
            else:
                return None
        except Exception:
            self.log.exception('get_docker_block_stats error')
            return None

    def get_io_metric(self, container, textkey, data, config):
        """Return a block-I/O rate (per second) or cumulative byte count."""
        container_id = container["Id"]

        def get_total(fname, operation_type):
            # Sum one op type ("Read"/"Write") across all devices in a
            # blkio.throttle.* file; skip the "Total" summary line.
            with open(fname, "r") as f:
                lines = f.readlines()
            total = 0
            for line in lines:
                if line.startswith("Total"):
                    continue
                device, op_type, num = line.split(" ")
                if op_type == operation_type:
                    total += int(num)
            return total

        key = textkey.split(".")[1]
        # Cache per container; a bare stat name would collide across containers.
        cache_key = '%s.%s' % (container_id, key)
        last = self.get_cache_results('docker.io', cache_key)
        if last:
            last_val = last[0][1]
            seconds = last[0][0]
        else:
            last_val = None
            seconds = None
        new_val = None
        if textkey in ['io.bytes_out', 'io.bytes_in']:
            if self.is_cgroups_v2():
                return self._get_docker_block_stats(container, textkey)
            # Fixed message: these keys exist only on cgroups-v2 hosts.
            raise Exception('{} is only supported with cgroups v2'.format(textkey))
        if textkey in ["io.bytes_written", "io.bytes_read"]:
            try:
                fname = "/sys/fs/cgroup/blkio/docker/%s/blkio.throttle.io_service_bytes" % container_id
                if "written" in textkey:
                    new_val = get_total(fname, "Write")
                elif "read" in textkey:
                    new_val = get_total(fname, "Read")
            except Exception:
                self.log.error("Error accessing %s", fname)
                return None
        elif textkey in ["io.write_ops", "io.read_ops"]:
            # Fixed typo: was "io.write_os", so write-ops never matched.
            try:
                fname = "/sys/fs/cgroup/blkio/docker/%s/blkio.throttle.io_serviced" % container_id
                if "write" in textkey:
                    new_val = get_total(fname, "Write")
                elif "read" in textkey:
                    new_val = get_total(fname, "Read")
            except Exception:
                self.log.error("Error accessing %s", fname)
                return None
        if new_val is None:
            return None
        self.cache_result('docker.io', cache_key, new_val, replace=True)
        if last_val is None or not seconds:
            return None
        return (new_val - last_val) / seconds

    def get_status_metric(self, container, textkey, data, config):
        """Return 1 if the container state is "running", else 0."""
        if textkey == "status.running":
            if container.get("State") != "running":
                return 0
            return 1


if __name__ == "__main__":
    plugin = DockerPlugin(None)
    plugin.get_metadata({})