Operating Hadoop from Java

Operating Hadoop from Java is far more cumbersome than from Python. I tried for a long time without success; today the experiment finally worked, so here are my notes.
1. As a beginner, be sure to import all the libraries under the common and HDFS directories, otherwise you will hit a long list of missing-dependency errors. The jars are found in the extracted Hadoop download. For example, I installed Hadoop 2.8.4 and extracted it to G:\project\hadoop\hadoop-2.8.4\, so import into the project every jar under G:\project\hadoop\hadoop-2.8.4\share\hadoop\common\lib, G:\project\hadoop\hadoop-2.8.4\share\hadoop\common, G:\project\hadoop\hadoop-2.8.4\share\hadoop\hdfs and G:\project\hadoop\hadoop-2.8.4\share\hadoop\hdfs\lib.
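If the project is built with Maven instead of copying jars by hand, the hadoop-client artifact normally pulls in the common and HDFS libraries for you; a minimal pom.xml sketch (assuming a Maven project and Hadoop 2.8.4) would be:

<!-- Pulls in hadoop-common, hadoop-hdfs and their dependencies transitively -->
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>2.8.4</version>
</dependency>
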
2. Create a file named log4j.properties in the project root with the content below (otherwise log4j complains that no configuration can be found), and add BasicConfigurator.configure(); to the code:
# Configure logging for testing: optionally with log file

#log4j.rootLogger=debug,appender
log4j.rootLogger=info,appender
#log4j.rootLogger=error,appender

# Output to the console
log4j.appender.appender=org.apache.log4j.ConsoleAppender
# Use TTCCLayout as the layout
log4j.appender.appender.layout=org.apache.log4j.TTCCLayout

3. The test code is as follows. Create a new file test.java in the project, and replace hadoop2.com with the fs.defaultFS value configured in core-site.xml:

package WordCount;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.log4j.BasicConfigurator;

public class test {

    public static void main(String[] args) {
        // TODO Auto-generated method stub
        System.out.println("Hello World ");
        BasicConfigurator.configure();
        try {
            listAllFile();
        } catch (Exception e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
        System.out.println("Hello World ");
    }

    /**
     * Get the HDFS file system
     * @return
     * @throws IOException
     * @throws URISyntaxException
     */
    public static FileSystem getFileSystem() throws IOException, URISyntaxException {
        // Read the config files
        Configuration conf = new Configuration();

        // Return the default file system
        // When running inside the Hadoop cluster, this call gets the default file system directly
        // FileSystem fs = FileSystem.get(conf);

        // Address of the target file system
        URI uri = new URI("hdfs://hadoop2.com:9000");

        // Return the specified file system
        // When testing locally, use this form to get the file system
        FileSystem fs = FileSystem.get(uri, conf);

        return fs;
    }

    /**
     * Create a directory
     * @throws Exception
     */
    public static void mkdir() throws Exception {
        // Get the file system
        FileSystem fs = getFileSystem();

        // Create the directory
        fs.mkdirs(new Path("hdfs://hadoop2.com:9000/test/weibo"));

        // Release resources
        fs.close();
    }

    /**
     * Delete a file or directory
     * @throws Exception
     */
    public static void rmdir() throws Exception {
        // Get the file system
        FileSystem fs = getFileSystem();

        // Delete the file or directory (recursively)
        fs.delete(new Path("hdfs://hadoop2.com:9000/test/weibo"), true);

        // Release resources
        fs.close();
    }


    /**
     * List all files in a directory
     * @throws Exception
     */
    public static void listAllFile() throws Exception {
        // Get the file system
        FileSystem fs = getFileSystem();

        // List the directory contents
        FileStatus[] status = fs.listStatus(new Path("hdfs://hadoop2.com:9000/test/"));

        // Get the paths of all entries in the directory
        Path[] listedPaths = FileUtil.stat2Paths(status);

        // Print each path
        for (Path path : listedPaths) {
            System.out.println(path);
        }

        // Release resources
        fs.close();
    }

    /**
     * Upload a file to HDFS
     * @throws Exception
     */
    public static void copyToHDFS() throws Exception {
        // Get the file system
        FileSystem fs = getFileSystem();

        // On Linux the source path would be: Path srcPath = new Path("/home/hadoop/temp.jar");
        // When testing on Windows, use a Windows path instead, e.g. E://temp.jar
        Path srcPath = new Path("E://temp.jar");

        // Destination path
        Path dstPath = new Path("hdfs://hadoop2.com:9000/test/weibo");

        // Upload the file
        fs.copyFromLocalFile(srcPath, dstPath);

        // Release resources
        fs.close();
    }

    /**
     * Download a file from HDFS
     * @throws Exception
     */
    public static void getFile() throws Exception {
        // Get the file system
        FileSystem fs = getFileSystem();

        // Source path on HDFS
        Path srcPath = new Path("hdfs://hadoop2.com:9000/test/weibo/temp.jar");

        // Destination path (a Linux path by default)
        // When testing on Windows, change it to a Windows path, e.g. C://User/andy/Desktop/
        Path dstPath = new Path("D://");

        // Download the file from HDFS
        fs.copyToLocalFile(srcPath, dstPath);

        // Release resources
        fs.close();
    }

    /**
     * Get information about the HDFS cluster nodes
     * @throws Exception
     */
    public static void getHDFSNodes() throws Exception {
        // Get the file system
        FileSystem fs = getFileSystem();

        // Cast to the distributed file system
        DistributedFileSystem hdfs = (DistributedFileSystem) fs;

        // Get all data nodes
        DatanodeInfo[] dataNodeStats = hdfs.getDataNodeStats();

        // Loop over the nodes
        for (int i = 0; i < dataNodeStats.length; i++) {
            System.out.println("DataNode_" + i + "_Name:" + dataNodeStats[i].getHostName());
        }

        // Release resources
        fs.close();
    }

    /**
     * Find where a file's blocks are located in the HDFS cluster
     * @throws Exception
     */
    public static void getFileLocal() throws Exception {
        // Get the file system
        FileSystem fs = getFileSystem();

        // File path
        Path path = new Path("hdfs://hadoop2.com:9000/test/weibo/temp.jar");

        // Get the file status
        FileStatus fileStatus = fs.getFileStatus(path);

        // Get the list of block locations
        BlockLocation[] blockLocations = fs.getFileBlockLocations(fileStatus, 0, fileStatus.getLen());

        // Print each block's location
        for (int i = 0; i < blockLocations.length; i++) {
            String[] hosts = blockLocations[i].getHosts();
            System.out.println("block_" + i + "_location:" + hosts[0]);
        }

        // Release resources
        fs.close();
    }

}
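
The sample output in step 4 below comes from running main() exactly as written above, which only calls listAllFile(). To try the other helpers as well, one option (just a sketch, using the same example paths as above) is to replace the body of main() with something like:

    // Hypothetical alternative main() for the same test class: runs each helper in turn.
    // Assumes the paths used above (/test, /test/weibo, E://temp.jar, D://) exist where required.
    public static void main(String[] args) throws Exception {
        BasicConfigurator.configure();
        mkdir();          // create /test/weibo
        copyToHDFS();     // upload E://temp.jar into /test/weibo
        listAllFile();    // list everything under /test
        getFileLocal();   // print the block locations of the uploaded jar
        getHDFSNodes();   // print the data node host names
        getFile();        // download the jar to D://
        rmdir();          // clean up /test/weibo
    }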

4. Result:
Hello World
0 [main] DEBUG org.apache.hadoop.metrics2.lib.MutableMetricsFactory - field org.apache.hadoop.metrics2.lib.MutableRate org.apache.hadoop.security.UserGroupInformation$UgiMetrics.loginSuccess with annotation @org.apache.hadoop.metrics2.annotation.Metric(about=, always=false, sampleName=Ops, type=DEFAULT, value=[Rate of successful kerberos logins and latency (milliseconds)], valueName=Time)
24 [main] DEBUG org.apache.hadoop.metrics2.lib.MutableMetricsFactory - field org.apache.hadoop.metrics2.lib.MutableRate org.apache.hadoop.security.UserGroupInformation$UgiMetrics.loginFailure with annotation @org.apache.hadoop.metrics2.annotation.Metric(about=, always=false, sampleName=Ops, type=DEFAULT, value=[Rate of failed kerberos logins and latency (milliseconds)], valueName=Time)
25 [main] DEBUG org.apache.hadoop.metrics2.lib.MutableMetricsFactory - field org.apache.hadoop.metrics2.lib.MutableRate org.apache.hadoop.security.UserGroupInformation$UgiMetrics.getGroups with annotation @org.apache.hadoop.metrics2.annotation.Metric(about=, always=false, sampleName=Ops, type=DEFAULT, value=[GetGroups], valueName=Time)
25 [main] DEBUG org.apache.hadoop.metrics2.lib.MutableMetricsFactory - field private org.apache.hadoop.metrics2.lib.MutableGaugeLong org.apache.hadoop.security.UserGroupInformation$UgiMetrics.renewalFailuresTotal with annotation @org.apache.hadoop.metrics2.annotation.Metric(about=, always=false, sampleName=Ops, type=DEFAULT, value=[Renewal failures since startup], valueName=Time)
25 [main] DEBUG org.apache.hadoop.metrics2.lib.MutableMetricsFactory - field private org.apache.hadoop.metrics2.lib.MutableGaugeInt org.apache.hadoop.security.UserGroupInformation$UgiMetrics.renewalFailures with annotation @org.apache.hadoop.metrics2.annotation.Metric(about=, always=false, sampleName=Ops, type=DEFAULT, value=[Renewal failures since last successful login], valueName=Time)
27 [main] DEBUG org.apache.hadoop.metrics2.impl.MetricsSystemImpl - UgiMetrics, User and group related metrics
117 [main] DEBUG org.apache.hadoop.security.authentication.util.KerberosName - Kerberos krb5 configuration not found, setting default realm to empty
123 [main] DEBUG org.apache.hadoop.security.Groups - Creating new Groups object
129 [main] DEBUG org.apache.hadoop.util.NativeCodeLoader - Trying to load the custom-built native-hadoop library...
152 [main] DEBUG org.apache.hadoop.util.NativeCodeLoader - Loaded the native-hadoop library
153 [main] DEBUG org.apache.hadoop.security.JniBasedUnixGroupsMapping - Using JniBasedUnixGroupsMapping for Group resolution
153 [main] DEBUG org.apache.hadoop.security.JniBasedUnixGroupsMappingWithFallback - Group mapping impl=org.apache.hadoop.security.JniBasedUnixGroupsMapping
202 [main] DEBUG org.apache.hadoop.security.Groups - Group mapping impl=org.apache.hadoop.security.JniBasedUnixGroupsMappingWithFallback; cacheTimeout=300000; warningDeltaMs=5000
211 [main] DEBUG org.apache.hadoop.security.UserGroupInformation - hadoop login
212 [main] DEBUG org.apache.hadoop.security.UserGroupInformation - hadoop login commit
213 [main] DEBUG org.apache.hadoop.security.UserGroupInformation - Using user: "server" with name server
213 [main] DEBUG org.apache.hadoop.security.UserGroupInformation - User entry: "server"
213 [main] DEBUG org.apache.hadoop.security.UserGroupInformation - Assuming keytab is managed externally since logged in from subject.
214 [main] DEBUG org.apache.hadoop.security.UserGroupInformation - UGI loginUser:server (auth:SIMPLE)
250 [main] DEBUG org.apache.htrace.core.Tracer - sampler.classes = ; loaded no samplers
413 [main] DEBUG org.apache.htrace.core.Tracer - span.receiver.classes = ; loaded no span receivers
841 [main] DEBUG org.apache.hadoop.hdfs.client.impl.DfsClientConf - dfs.client.use.legacy.blockreader.local = false
841 [main] DEBUG org.apache.hadoop.hdfs.client.impl.DfsClientConf - dfs.client.read.shortcircuit = false
841 [main] DEBUG org.apache.hadoop.hdfs.client.impl.DfsClientConf - dfs.client.domain.socket.data.traffic = false
841 [main] DEBUG org.apache.hadoop.hdfs.client.impl.DfsClientConf - dfs.domain.socket.path =
854 [main] DEBUG org.apache.hadoop.hdfs.DFSClient - Sets dfs.client.block.write.replace-datanode-on-failure.min-replication to 0
890 [main] DEBUG org.apache.hadoop.io.retry.RetryUtils - multipleLinearRandomRetry = null
929 [main] DEBUG org.apache.hadoop.ipc.Server - rpcKind=RPC_PROTOCOL_BUFFER, rpcRequestWrapperClass=class org.apache.hadoop.ipc.ProtobufRpcEngine$RpcProtobufRequest, rpcInvoker=org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker@489115ef
1166 [main] DEBUG org.apache.hadoop.ipc.Client - getting client out of cache: org.apache.hadoop.ipc.Client@b2c9a9c
1784 [main] DEBUG org.apache.hadoop.util.PerformanceAdvisory - Both short-circuit local reads and UNIX domain socket are disabled.
1790 [main] DEBUG org.apache.hadoop.hdfs.protocol.datatransfer.sasl.DataTransferSaslUtil - DataTransferProtocol not using SaslPropertiesResolver, no QOP found in configuration for dfs.data.transfer.protection
1849 [main] DEBUG org.apache.hadoop.ipc.Client - The ping interval is 60000 ms.
1860 [main] DEBUG org.apache.hadoop.ipc.Client - Connecting to hadoop2.com/192.168.129.130:9000
1969 [IPC Client (2093010349) connection to hadoop2.com/192.168.129.130:9000 from server] DEBUG org.apache.hadoop.ipc.Client - IPC Client (2093010349) connection to hadoop2.com/192.168.129.130:9000 from server: starting, having connections 1
1973 [IPC Parameter Sending Thread #0] DEBUG org.apache.hadoop.ipc.Client - IPC Client (2093010349) connection to hadoop2.com/192.168.129.130:9000 from server sending #0 org.apache.hadoop.hdfs.protocol.ClientProtocol.getListing
1984 [IPC Client (2093010349) connection to hadoop2.com/192.168.129.130:9000 from server] DEBUG org.apache.hadoop.ipc.Client - IPC Client (2093010349) connection to hadoop2.com/192.168.129.130:9000 from server got value #0
1984 [main] DEBUG org.apache.hadoop.ipc.ProtobufRpcEngine - Call: getListing took 170ms
hdfs://hadoop2.com:9000/test/output
hdfs://hadoop2.com:9000/test/start.txt
hdfs://hadoop2.com:9000/test/test.txt
2027 [main] DEBUG org.apache.hadoop.ipc.Client - stopping client from cache: org.apache.hadoop.ipc.Client@b2c9a9c
2028 [main] DEBUG org.apache.hadoop.ipc.Client - removing client from cache: org.apache.hadoop.ipc.Client@b2c9a9c
2028 [main] DEBUG org.apache.hadoop.ipc.Client - stopping actual client because no more references remain: org.apache.hadoop.ipc.Client@b2c9a9c
2028 [main] DEBUG org.apache.hadoop.ipc.Client - Stopping client
2028 [IPC Client (2093010349) connection to hadoop2.com/192.168.129.130:9000 from server] DEBUG org.apache.hadoop.ipc.Client - IPC Client (2093010349) connection to hadoop2.com/192.168.129.130:9000 from server: closed
2028 [IPC Client (2093010349) connection to hadoop2.com/192.168.129.130:9000 from server] DEBUG org.apache.hadoop.ipc.Client - IPC Client (2093010349) connection to hadoop2.com/192.168.129.130:9000 from server: stopped, remaining connections 0
Hello World
2131 [Thread-2] DEBUG org.apache.hadoop.util.ShutdownHookManager - ShutdownHookManger complete shutdown.


Reposted from www.cnblogs.com/kuainiao/p/9417767.html