一句话,程序员必须是万能的!
贴一篇文章,记录一下使用nagios监控Hadoop集群的过程,以免忘了。
安装过程(所有下载的文件都放在/usr/local/download)
#基础依赖
# yum install php php-gd -y
# yum -y install xinetd
# useradd -m nagios 创建nagios用户(若已存在可跳过)
# passwd nagios 为nagios用户设置密码
# groupadd nagcmd 创建用户组用于Web接口执行外部命令
# usermod -a -G nagcmd nagios 把nagios 用户加入nagcmd组
# usermod -a -G nagcmd apache 把apache 用户加入nagcmd组
安装nagios
# cd download
# wget http://cdnetworks-kr-2.dl.sourceforge.net/project/nagios/nagios-3.x/nagios-3.3.1/nagios-3.3.1.tar.gz
# tar zxf nagios-3.3.1.tar.gz
# cd nagios
# ./configure --with-command-group=nagcmd
# make all
# make install
# make install-init
# make install-commandmode
# make install-config
# make install-webconf
修改/usr/local/nagios/etc/objects/contacts.cfg文件中联系人邮箱
添加认证用户nagiosadmin
# htpasswd -c /usr/local/nagios/etc/htpasswd.users nagiosadmin
安装plugin
下载到download
# tar zxf nagios-plugins-1.4.15.tar.gz
# cd nagios-plugins-1.4.15
# ./configure --with-nagios-user=nagios --with-nagios-group=nagios
# make && make install
将nagios、httpd服务加入到系统服务,开机启动
# chkconfig nagios on
# chkconfig --add httpd
# chkconfig httpd on
验证配置文件&启动nagios httpd
# /usr/local/nagios/bin/nagios -v /usr/local/nagios/etc/nagios.cfg
# service nagios start
# service httpd start
注:访问ip:8008/nagios/时,出现服务器内部错误问题,经查是cgi执行权限的原因,通过
以下命令解决:
# chcon -R -t httpd_sys_content_t /usr/local/nagios/share/
{
# chown nagios.nagios /usr/local/nagios/
# chown -R nagios.nagios /usr/local/nagios/libexec/
}
安装nrpe
先安装ssl
下载nrpe
# tar zxf nrpe-2.13.tar.gz
# cd nrpe-2.13
# ./configure
# make all
# make install-plugin
# make install-daemon
# make install-daemon-config
# make install-xinetd
修改commands.cfg 文件,最后加入一段
# 'check_nrpe' command definition
define command{
command_name check_nrpe
command_line $USER1$/check_nrpe -H $HOSTADDRESS$ -c $ARG1$
}
修改/etc/services文件,添加一行
nrpe 5666/tcp #NRPE
# vim /etc/xinetd.d/nrpe
only_from = 127.0.0.1后加入 117....(不同地址间加空格)
重启xinetd
重启nrpe
# /usr/local/nagios/bin/nrpe -c /usr/local/nagios/etc/nrpe.cfg -d
# /usr/local/nagios/libexec/check_nrpe -H 192.168.198.207
测试本机连通性
NRPE v2.13
测试本机负载
# /usr/local/nagios/libexec/check_nrpe -H 192.168.198.207 -c check_load
OK - load average: 0.06, 0.12, 0.17|load1=0.060;15.000;30.000;0; load5=0.120;10.000;25.000;0; load15=0.170;5.000;20.000;0;
安装pnp4nagios
安装 相关依赖
下载pnp
下载rrdtool
# tar zxf rrdtool-1.4.7.tar.gz
# cd rrdtool-1.4.7
# ./configure --prefix=/usr/local/rrdtool
# make && make install
# cd ..
# tar zxf pnp4nagios-0.6.17.tar.gz
# cd pnp4nagios-0.6.17
# ./configure --prefix=/usr/local/pnp4nagios --with-rrdtool=/usr/local/rrdtool/bin/rrdtool
(出现错误:configure: error: Perl Module Time::HiRes not available
解决办法:# yum install perl* -y
或者:# yum install perl-Time-HiRes
# yum install rrdtool-perl
)
# make all
# make install
# make install-webconf
# make install-config
# make install-init
删除 或者重命名install.php
# mv install.php install.php.lock
拷贝样例文件
# mv misccommands.cfg-sample misccommands.cfg
# mv nagios.cfg-sample nagios.cfg
# mv rra.cfg-sample rra.cfg
# cd pages/
# mv web_traffic.cfg-sample web_traffic.cfg
# cd ../check_commands/
# mv check_all_local_disks.cfg-sample check_all_local_disks.cfg
# mv check_nrpe.cfg-sample check_nrpe.cfg
# mv check_nwstat.cfg-sample check_nwstat.cfg
# service npcd restart
# cp /usr/local/download/pnp4nagios-0.6.17/contrib/ssi//* /usr/local/nagios/share/ssi/
#回到解压目录下,复制鼠标悬停图标显示,预览绘图所需的文件
修改nagios 的模板文件,在对应的主机和服务定义字段里加入以下内容,
# 'process-host-perfdata' command definition
define command{
command_name process-host-perfdata
command_line /usr/local/pnp4nagios/libexec/process_perfdata.pl -d HOSTPERFDATA
}
# 'process-service-perfdata' command definition
define command{
command_name process-service-perfdata
command_line /usr/local/pnp4nagios/libexec/process_perfdata.pl
}
# vi /usr/local/nagios/etc/nagios.cfg
process_performance_data=1
#将此变量值设为1
host_perfdata_command=process-host-perfdata
service_perfdata_command=process-service-perfdata
#取消这两项的注释
编辑主机定义
在define host 的use中加入host-pnp
在define Service 的use中加入srv-pnp
重启nagios 可以看到图标
添加规则,然后保存一下就可以。
# /etc/rc.d/init.d/iptables save
如果还不管用就重启一下防火墙
# /etc/init.d/iptables restart
添加被监控主机:
安装nagios-plugin和nrpe(安装过程略)
在/etc/services文件中添加
#add nrpe
nrpe 5666/tcp # nrpe
# vi /usr/local/nagios/etc/nrpe.cfg
修改
allowed_hosts=127.0.0.1,117.40.133.17,192.168.198.207
主控机配置:
在/usr/local/nagios/etc/nagios.cfg中加一行
#add linux cfg
cfg_file=/usr/local/nagios/etc/objects/linux.cfg
编辑/usr/local/nagios/etc/objects/linux.cfg的内容为
define host{
use linux-server
host_name centos207
alias centos207
address 192.168.198.207
}
define service{
use generic-service
host_name centos207
service_description HTTP
check_command check_http
}
define service{
use generic-service
host_name centos207
service_description FTP
check_command check_ftp
}
define service{
use generic-service
host_name centos207
service_description SSH
check_command check_ssh
}
define service{
use generic-service
host_name centos207
service_description SMTP
check_command check_smtp
}
define service{
use generic-service
host_name centos207
service_description POP3
check_command check_pop
}
define service{
use generic-service
host_name centos207
service_description check-swap
check_command check_nrpe!check_swap
}
define service{
use generic-service
host_name centos207
service_description check-load
check_command check_nrpe!check_load
}
define service{
use generic-service
host_name centos207
service_description check-disk
check_command check_nrpe!check_sda1
}
define service{
use generic-service
host_name centos207
service_description zombie_procs
check_command check_nrpe!check_zombie_procs
}
define service{
use generic-service
host_name centos207
service_description check-users
check_command check_nrpe!check_users
}
define service{
use generic-service
host_name centos207
service_description total_procs
check_command check_nrpe!check_total_procs
}