Ganglia Distributed Monitoring System

I. Introduction

Ganglia is an open-source monitoring project started at UC Berkeley, designed to monitor thousands of nodes. Every server runs a daemon called gmond that collects monitoring data from the operating system and from specified hosts and sends it on. A host that receives all of the monitoring data can display it and pass a condensed form of it further up the hierarchy. This hierarchical architecture is what lets Ganglia scale so well. gmond puts very little load on the system, which makes it practical to run on every server in the cluster without affecting user performance.

Ganglia is used mainly to monitor system performance. Its graphs make it easy to see the working state of each node, which is a great help in tuning and allocating system resources and improving overall system performance. It can be accessed through a browser, but it does not monitor hardware-level indicators of the nodes. Ganglia is a distributed monitoring system.

Ganglia components

Ganglia consists of the following programs, which pass monitoring data between each other in XML format.

Server side, ganglia meta daemon (gmetad): collects the data of each cluster and writes it into RRD databases.
Client side, ganglia monitoring daemon (gmond): collects monitoring data on the local machine, sends it to other servers, and collects the monitoring data of other servers so that gmetad can read it.
Web-based access, ganglia PHP web frontend: a web-based monitoring interface. It must be installed on the same node as gmetad; it fetches data from gmetad, reads the RRD databases, and generates the graphs for display.

Ganglia operating modes

Ganglia can collect data in unicast or multicast mode; the default is multicast.

Unicast: a node sends the monitoring data it collects to one or a few specific servers; this can cross network segments.
Multicast: a node sends the monitoring data it collects to all servers on the same network segment, and at the same time receives the monitoring data sent by all servers on that segment. Since the data is sent as multicast packets, the nodes have to be on the same segment, but within one segment different send channels can be defined.

II. Installation

# apt-get install libconfuse-dev expat libpcre3-dev libpango1.0-dev libxml2-dev libapr1-dev libexpat-dev rrdtool librrds-perl librrd2-dev python-dev
# wget http://nchc.dl.sourceforge.net/project/ganglia/ganglia%20monitoring%20core/3.2.0/ganglia-3.2.0.tar.gz
# tar zxvf ganglia-3.2.0.tar.gz -C ../software/
# cd ../software/ganglia-3.2.0
# ./configure --prefix=/usr/local/ganglia-3.2.0 --with-gmetad --enable-gexec
# make
# make install

III. Configuration

# vim gmetad.conf

data_source "cluster-db" node1 node2
// Defines the cluster name and the nodes in the cluster. Because multicast mode is used, every gmond node already holds the monitoring data of all nodes in its cluster, so there is no need to list every node here. Listing at least two is recommended: if node1 goes down, gmetad automatically fetches the data from node2 instead. The names are resolved via DNS when gmetad starts.
data_source "cluster-memcache" 192.168.41.129
rrd_rootdir "/data/ganglia/rrds"
// The directory in which the RRD databases are stored. After gmetad collects the monitoring data, it writes it into the corresponding RRD database under this directory.
case_sensitive_hostnames 1
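After editing gmetad.conf, you can start the daemon and confirm that it is serving data. A minimal sanity check, assuming the install prefix from section II and gmetad's default XML port 8651:

# /usr/local/ganglia-3.2.0/sbin/gmetad
# nc localhost 8651 | head      // gmetad dumps the aggregated cluster state as XML and closes the connection

If nothing comes back, rerun gmetad with a higher debug level (gmetad -d 2) to see what it is doing.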
# /usr/local/ganglia-3.2.0/sbin/gmond -t > /usr/local/ganglia-3.2.0/etc/gmond.conf      // -t writes a default configuration file to stdout
# vim gmond.conf

globals {
  daemonize = yes                      // run as a daemon
  setuid = yes
  user = nobody                        // the user to run as
  debug_level = 0                      // debug level
  max_udp_msg_len = 1472               // maximum UDP packet length
  mute = no                            // "mute": when yes, this node no longer broadcasts any of the data it collects to the network
  deaf = no                            // "deaf": when yes, this node no longer receives data packets broadcast by other nodes
  allow_extra_data = yes
  host_dmax = 0 /*secs */
  host_tmax = 20 /*secs */
  cleanup_threshold = 300 /*secs */
  gexec = no                           // whether to use gexec
  send_metadata_interval = 0 /*secs */
}

cluster {
  name = "cluster-db"                  // which cluster this node belongs to; must match a data_source entry in gmetad.conf
  owner = "xuhh"                       // who owns this node
  latlong = "unspecified"
  url = "unspecified"
}

host {
  location = "node1"
}

udp_send_channel {                     // UDP send channel
  mcast_join = 239.2.11.71             // multicast address; this node works on channel 239.2.11.71. In unicast mode write host = node1 instead, and several udp_send_channel blocks can be configured (a unicast sketch follows this listing)
  port = 8649                          // port
  ttl = 1
}

udp_recv_channel {                     // UDP receive channel
  mcast_join = 239.2.11.71
  port = 8649
  bind = 239.2.11.71
}

tcp_accept_channel {                   // TCP accept channel; several tcp_accept_channel blocks can be configured to share the cluster's monitoring data (a quick check follows this listing)
  port = 8649                          // remote hosts can connect to port 8649 to fetch the monitoring data
}

modules {                              // monitoring modules
  module { name = "core_metrics" }
  module { name = "cpu_module"   path = "modcpu.so" }
  module { name = "disk_module"  path = "moddisk.so" }
  module { name = "load_module"  path = "modload.so" }
  module { name = "mem_module"   path = "modmem.so" }
  module { name = "net_module"   path = "modnet.so" }
  module { name = "proc_module"  path = "modproc.so" }
  module { name = "sys_module"   path = "modsys.so" }
}

/* This collection group will cause a heartbeat (or beacon) to be sent every
   20 seconds. In the heartbeat is the GMOND_STARTED data which expresses
   the age of the running gmond. */
collection_group {
  collect_once = yes
  time_threshold = 20
  metric { name = "heartbeat" }
}

/* This collection group will send general info about this host every 1200 secs.
   This information doesn't change between reboots and is only collected once. */
collection_group {
  collect_once = yes
  time_threshold = 1200
  metric { name = "cpu_num"      title = "CPU Count" }
  metric { name = "cpu_speed"    title = "CPU Speed" }
  metric { name = "mem_total"    title = "Memory Total" }
  /* Should this be here? Swap can be added/removed between reboots. */
  metric { name = "swap_total"   title = "Swap Space Total" }
  metric { name = "boottime"     title = "Last Boot Time" }
  metric { name = "machine_type" title = "Machine Type" }
  metric { name = "os_name"      title = "Operating System" }
  metric { name = "os_release"   title = "Operating System Release" }
  metric { name = "location"     title = "Location" }
}

/* This collection group will send the status of gexecd for this host every 300 secs. */
/* Unlike 2.5.x the default behavior is to report gexecd OFF. */
collection_group {
  collect_once = yes
  time_threshold = 300
  metric { name = "gexec" title = "Gexec Status" }
}

/* This collection group will collect the CPU status info every 20 secs.
   The time threshold is set to 90 seconds. In honesty, this time_threshold
   could be set significantly higher to reduce unneccessary network chatter. */
collection_group {
  collect_every = 20
  time_threshold = 90
  /* CPU status */
  metric { name = "cpu_user"   value_threshold = "1.0" title = "CPU User" }
  metric { name = "cpu_system" value_threshold = "1.0" title = "CPU System" }
  metric { name = "cpu_idle"   value_threshold = "5.0" title = "CPU Idle" }
  metric { name = "cpu_nice"   value_threshold = "1.0" title = "CPU Nice" }
  metric { name = "cpu_aidle"  value_threshold = "5.0" title = "CPU aidle" }
  metric { name = "cpu_wio"    value_threshold = "1.0" title = "CPU wio" }
  /* The next two metrics are optional if you want more detail...
     ... since they are accounted for in cpu_system.
  metric { name = "cpu_intr"  value_threshold = "1.0" title = "CPU intr" }
  metric { name = "cpu_sintr" value_threshold = "1.0" title = "CPU sintr" }
  */
}

collection_group {
  collect_every = 20
  time_threshold = 90
  /* Load Averages */
  metric { name = "load_one"     value_threshold = "1.0" title = "One Minute Load Average" }
  metric { name = "load_five"    value_threshold = "1.0" title = "Five Minute Load Average" }
  metric { name = "load_fifteen" value_threshold = "1.0" title = "Fifteen Minute Load Average" }
}

/* This group collects the number of running and total processes */
collection_group {
  collect_every = 80
  time_threshold = 950
  metric { name = "proc_run"   value_threshold = "1.0" title = "Total Running Processes" }
  metric { name = "proc_total" value_threshold = "1.0" title = "Total Processes" }
}

/* This collection group grabs the volatile memory metrics every 40 secs and
   sends them at least every 180 secs. This time_threshold can be increased
   significantly to reduce unneeded network traffic. */
collection_group {
  collect_every = 40
  time_threshold = 180
  metric { name = "mem_free"    value_threshold = "1024.0" title = "Free Memory" }
  metric { name = "mem_shared"  value_threshold = "1024.0" title = "Shared Memory" }
  metric { name = "mem_buffers" value_threshold = "1024.0" title = "Memory Buffers" }
  metric { name = "mem_cached"  value_threshold = "1024.0" title = "Cached Memory" }
  metric { name = "swap_free"   value_threshold = "1024.0" title = "Free Swap Space" }
}

collection_group {
  collect_every = 40
  time_threshold = 300
  metric { name = "bytes_out" value_threshold = 4096 title = "Bytes Sent" }
  metric { name = "bytes_in"  value_threshold = 4096 title = "Bytes Received" }
  metric { name = "pkts_in"   value_threshold = 256  title = "Packets Received" }
  metric { name = "pkts_out"  value_threshold = 256  title = "Packets Sent" }
}

/* Different than 2.5.x default since the old config made no sense */
collection_group {
  collect_every = 1800
  time_threshold = 3600
  metric { name = "disk_total" value_threshold = 1.0 title = "Total Disk Space" }
}

collection_group {
  collect_every = 40
  time_threshold = 180
  metric { name = "disk_free"     value_threshold = 1.0 title = "Disk Space Available" }
  metric { name = "part_max_used" value_threshold = 1.0 title = "Maximum Disk Space Used" }
}

include ("/usr/local/ganglia-3.2.0/etc/conf.d/*.conf")
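As the udp_send_channel comment notes, unicast mode replaces mcast_join with explicit host entries, and several send channels may be configured. A minimal sketch, assuming node1 and node2 act as the collectors; in unicast mode it is also commonly recommended to set send_metadata_interval in globals to a positive value (for example 30) so that a restarted collector relearns the metric metadata:

udp_send_channel {      // unicast: one channel per collector; may cross network segments
  host = node1
  port = 8649
}
udp_send_channel {
  host = node2
  port = 8649
}
udp_recv_channel {      // on the collector nodes: plain UDP receive, no mcast_join or bind
  port = 8649
}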
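With gmond.conf in place, gmond can be started and the tcp_accept_channel checked from the shell. A quick test, assuming the install prefix from section II:

# /usr/local/ganglia-3.2.0/sbin/gmond -c /usr/local/ganglia-3.2.0/etc/gmond.conf
# nc localhost 8649 | head      // gmond answers with an XML dump of everything it currently knows

In multicast mode the dump should also include the other nodes on the segment, since each gmond hears its peers' broadcasts.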
# mkdir -p /data/ganglia/{rrds,dwoo}
# chown -R nobody:nobody /data/ganglia
# chmod -R 777 /data/ganglia/rrds

For servers with two network cards, where the monitoring traffic between servers runs over the internal network, the multicast route has to be set as follows:

# ip route add 239.2.11.71 dev eth0      // eth0 is the internal interface

Client side:

Install as shown above.

# vim gmond.conf

cluster {
  name = "cluster-memcache"
  owner = "xuhh"
  latlong = "unspecified"
  url = "unspecified"
}

host {
  location = "192.168.41.129"
}

ganglia PHP web frontend configuration:

Setting up the PHP environment is omitted here.

# cp -a /usr/local/src/software/ganglia-3.2.0/web/ /var/www/ganglia
# vim /var/www/ganglia/conf.php

$gmetad_root = "/data/ganglia";

IV. Monitoring Charts

With gmetad and the web frontend running, open the frontend in a browser to see each node's graphs for load, CPU, memory, disk, and network.
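The charts are rendered from the RRD files that gmetad writes under rrd_rootdir. To sanity-check the data behind a chart, a graph can also be rendered by hand. A minimal sketch, assuming the paths configured above and that, as is conventional for Ganglia RRDs, each metric file has a single data source named "sum" (confirm with rrdtool info if unsure):

# rrdtool info /data/ganglia/rrds/cluster-db/node1/load_one.rrd      // inspect the metric's RRD and its data source
# rrdtool graph /tmp/load_one.png --start -1h --title "node1 one-minute load" \
    DEF:load=/data/ganglia/rrds/cluster-db/node1/load_one.rrd:sum:AVERAGE \
    LINE2:load#0000FF:"load_one"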

Reposted from: https://my.oschina.net/766/blog/211502
