任务1-1
1、创建hadoop用户
sudo useradd -m hadoop 创建用户
sudo passwd hadoop 设置密码
2、安装配置ssh
安装ssh server:sudo apt-get install openssh-server
cd ~/.ssh/ # 若没有该目录,请先执行一次ssh localhost
ssh-keygen -t rsa # 会有提示,都按回车就可以
cat id_rsa.pub >> authorized_keys # 加入授权
使用ssh localhost试试能否直接登录
3、安装配置JDK
cd /usr/lib/ 打开/usr/lib文件夹
sudo mkdir jvm 创建jvm文件夹
sudo tar zxvf ~/下载/jdk-8u91-linux-x64.tar.gz -C /usr/lib/jvm
设置JAVA_HOME:
sudo gedit ~/.bashrc
export JAVA_HOME=/usr/lib/jvm/jdk1.8.0_91,保存退出。
立即生效:source ~/.bashrc
测试JAVA_HOME是否设置成功,输出了上面设置的路径表示成功:
echo $JAVA_HOME
(可选,与上面的JDK 8二选一)也可用apt直接安装OpenJDK 7:
sudo apt-get install openjdk-7-jdk
export JAVA_HOME=/usr/lib/jvm/java-7-openjdk-amd64
java -version 查看Java版本,验证安装是否成功
- 安装Hadoop2.7.1
sudo tar zxvf ~/下载/hadoop-2.7.1.tar.gz -C /usr/local
cd /usr/local/
sudo mv ./hadoop-2.7.1/ ./hadoop # 将文件夹名改为hadoop
sudo chown -R hadoop ./hadoop # 修改文件所有者,hadoop为当前用户名
sudo gedit ~/.bashrc
打开界面后,在之前配置的JAVA_HOME后面输入:
export HADOOP_INSTALL=/usr/local/hadoop
export PATH=$PATH:$HADOOP_INSTALL/bin
export PATH=$PATH:$HADOOP_INSTALL/sbin
export HADOOP_MAPRED_HOME=$HADOOP_INSTALL
export HADOOP_COMMON_HOME=$HADOOP_INSTALL
export HADOOP_HDFS_HOME=$HADOOP_INSTALL
export YARN_HOME=$HADOOP_INSTALL
立即生效:source ~/.bashrc
- 配置伪分布式
切换至配置文件目录: cd /usr/local/hadoop/etc/hadoop
sudo gedit core-site.xml
<configuration>
<property>
<name>hadoop.tmp.dir</name>
<value>file:/usr/local/hadoop/tmp</value>
<description>Abase for other temporary directories.</description>
</property>
<property>
<name>fs.defaultFS</name>
<value>hdfs://localhost:9000</value>
</property>
</configuration>
sudo gedit hdfs-site.xml
<configuration>
<property>
<name>dfs.replication</name>
<value>1</value>
</property>
<property>
<name>dfs.namenode.name.dir</name>
<value>file:/usr/local/hadoop/tmp/dfs/name</value>
</property>
<property>
<name>dfs.datanode.data.dir</name>
<value>file:/usr/local/hadoop/tmp/dfs/data</value>
</property>
</configuration>
sudo gedit yarn-site.xml
<configuration>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<property>
<name>yarn.nodemanager.aux-services.mapreduce.shuffle.class</name>
<value>org.apache.hadoop.mapred.ShuffleHandler</value>
</property>
</configuration>
mv mapred-site.xml.template mapred-site.xml 更换名字
sudo gedit mapred-site.xml
<configuration>
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
</configuration>
- 启动/停止hadoop
格式化。
hdfs namenode -format
start-all.sh 启动所有的Hadoop守护进程。包括NameNode、Secondary NameNode、DataNode、JobTracker、TaskTracker
stop-all.sh 停止所有的Hadoop守护进程。包括NameNode、Secondary NameNode、DataNode、JobTracker、TaskTracker
start-dfs.sh 启动Hadoop HDFS守护进程NameNode、SecondaryNameNode和DataNode
stop-dfs.sh 停止Hadoop HDFS守护进程NameNode、SecondaryNameNode和DataNode
hadoop-daemons.sh start namenode 单独启动NameNode守护进程
hadoop-daemons.sh stop namenode 单独停止NameNode守护进程
hadoop-daemons.sh start datanode 单独启动DataNode守护进程
hadoop-daemons.sh stop datanode 单独停止DataNode守护进程
hadoop-daemons.sh start secondarynamenode 单独启动SecondaryNameNode守护进程
hadoop-daemons.sh stop secondarynamenode 单独停止SecondaryNameNode守护进程
start-mapred.sh 启动Hadoop MapReduce守护进程JobTracker和TaskTracker
stop-mapred.sh 停止Hadoop MapReduce守护进程JobTracker和TaskTracker
hadoop-daemons.sh start jobtracker 单独启动JobTracker守护进程
hadoop-daemons.sh stop jobtracker 单独停止JobTracker守护进程
hadoop-daemons.sh start tasktracker 单独启动TaskTracker守护进程
hadoop-daemons.sh stop tasktracker 单独停止TaskTracker守护进程
jps 查看
完整进程如下:
2583 DataNode
2970 ResourceManager
3461 Jps
3177 NodeManager
2361 NameNode
2840 SecondaryNameNode
若执行jps后提示:
程序 'jps' 已包含在下列软件包中:
* default-jdk
* ecj
* gcj-4.6-jdk
* openjdk-6-jdk
* gcj-4.5-jdk
* openjdk-7-jdk
请尝试:sudo apt-get install <选定的软件包>
那么请执行下面命令,手动设置系统默认JDK:
sudo update-alternatives --install /usr/bin/jps jps /usr/lib/jvm/jdk1.7.0_79/bin/jps 1
sudo update-alternatives --install /usr/bin/javac javac /usr/lib/jvm/jdk1.7.0_79/bin/javac 300
sudo update-alternatives --install /usr/bin/java java /usr/lib/jvm/jdk1.7.0_79/bin/java 300
再次执行jps就不会出现提示了。
任务1-2
启动Hadoop
hdfs dfs -mkdir -p /user/hadoop (要使用当前用户的用户名)
hdfs dfs -mkdir -p /input 在hdfs创建input目录
hdfs dfs -put ~/下载/dat0102.dat /input/ 将本地文件dat0102.dat导入到HDFSinput目录中
hadoop jar /usr/local/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.7.1.jar grep /input/dat0102.dat /output "HDFS"
调用Hadoop jar包来查询dat0102.dat中的HDFS字段出现的次数,并保存在output目录下
hdfs dfs -cat /output/part-r-00000 输出hdfs字段出现的次数
任务1-3
Hadoop 平台进行性能调优
sudo gedit yarn-site.xml
<property>
<name>yarn.scheduler.maximum-allocation-mb</name>
<value>2048</value>
</property>
sudo gedit mapred-site.xml
<property>
<name>mapreduce.map.memory.mb</name>
<value>1024</value>
</property>
<property>
<name>mapreduce.reduce.memory.mb</name>
<value>2048</value>
</property>
<property>
<name>mapreduce.map.java.opts</name>
<value>-Xmx768m</value>
</property>
<property>
<name>mapreduce.reduce.java.opts</name>
<value>-Xmx1536m</value>
</property>
任务2-4
- 安装hive2.1.1
sudo tar -zxvf ~/下载/apache-hive-2.1.1-bin.tar.gz -C /usr/local
cd /usr/local/
sudo mv apache-hive-2.1.1-bin hive # 将文件夹名改为hive
sudo chown -R hadoop:hadoop hive # 修改文件所有者
sudo chmod -R 774 hive # 修改文件权限
- 配置hive环境
sudo apt-get install vim 安装vim
vim ~/.bashrc
export HIVE_HOME=/usr/local/hive
export PATH=$PATH:$HIVE_HOME/bin
source ~/.bashrc
- 配置Hive
cd /usr/local/hive/conf
mv hive-env.sh.template hive-env.sh
mv hive-default.xml.template hive-site.xml
mv hive-log4j2.properties.template hive-log4j2.properties
mv hive-exec-log4j2.properties.template hive-exec-log4j2.properties
- 修改hive-env.sh
export JAVA_HOME=/usr/lib/jvm/jdk1.7.0_79 ##Java路径(与实际安装的JDK路径保持一致,二选一)
export JAVA_HOME=/usr/lib/jvm/java-7-openjdk-amd64 ##Java路径(二选一)
export HADOOP_HOME=/usr/local/hadoop ##Hadoop安装路径
export HIVE_HOME=/usr/local/hive ##Hive安装路径
export HIVE_CONF_DIR=/usr/local/hive/conf ##Hive配置文件路径
- 修改hive-site.xml
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:mysql://localhost:3306/hive?createDatabaseIfNotExist=true</value>
<description>JDBC connect string for a JDBC metastore</description>
</property>
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>com.mysql.jdbc.Driver</value>
<description>Driver class name for a JDBC metastore</description>
</property>
<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>hive</value>
<description>username to use against metastore database</description>
</property>
<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>hive</value>
<description>password to use against metastore database</description>
</property>
</configuration>
- 安装并配置mysql
sudo apt-get install mysql-server #安装mysql
service mysql start 启动MySQL
service mysql stop 停止MySQL
sudo netstat -tap | grep mysql 查看是否启动成功
mysql -u root -p 进入MySQL shell 页面
- 创建一个 hive 数据库用来存储 Hive 元数据,且数据库访问的用户名和密码都为 hive。
mysql> CREATE DATABASE hive;
mysql> USE hive;
mysql> CREATE USER 'hive'@'localhost' IDENTIFIED BY 'hive';
mysql> GRANT ALL ON hive.* TO 'hive'@'localhost' IDENTIFIED BY 'hive';
mysql> GRANT ALL ON hive.* TO 'hive'@'%' IDENTIFIED BY 'hive';
mysql> FLUSH PRIVILEGES;
mysql> quit;
- 安装MySQL jdbc包
tar -zxvf ~/下载/mysql-connector-java-5.1.40.tar.gz -C /usr/local/hive 解压
cp /usr/local/hive/mysql-connector-java-5.1.40/mysql-connector-java-5.1.40-bin.jar /usr/local/hive/lib #将mysql-connector-java-5.1.40-bin.jar拷贝到/usr/local/hive/lib目录下
- 运行之前先初始化操作
schematool -initSchema -dbType mysql
- 启动hadoop
start-all.sh
- 启动hive
1、在Hadoop平台创建result目录
hdfs dfs -mkdir -p /result
2、创建Hive数据表(表名为:movie)
create table movie(name string,time string,score string)
row format delimited fields terminated by ',';
3、加载数据
load data local inpath '/home/hadoop/Downloads/dat0204.log' into table movie;
4、查询数据
select * from movie where time>='2014.1.1' and time<='2014.12.31' order by time;
5、导入Hadoop平台的result目录
insert overwrite directory "/result"
row format delimited fields terminated by ',' select * from movie;
6、jar包
hadoop jar /usr/local/hadoop/share/hadoop/tools/lib/hadoop-streaming-2.7.1.jar \
-file ~/Downloads/ans0203map.py \
-mapper 'python ans0203map.py' \
-file ~/Downloads/ans0203reduce.py \
-reducer 'python ans0203reduce.py' \
-input /input/dat0203.log \
-output /output