hdfs常用API操作练习

本文链接： https://blog.csdn.net/a805814077/article/details/101915900

1.文件的上传与下载

使用流的方式上传文件
使用流的方式下载文件
从随机地方开始读，读任意长度

2.手动拷贝某个特定的数据块（例如：只拷贝某个文件的第二个数据块）

3.删除HDFS集群中的所有空文件和空目录
（原文此处有一张图片，未能保留）
我把下面这份配置文件（其中设置了 fs.defaultFS）加入了项目，作为默认配置被 Configuration 自动加载，所以代码中不需要再调用 conf.set 来指定 HDFS 地址。

<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License. See accompanying LICENSE file.
-->

<!-- Put site-specific property overrides in this file. -->

<configuration>
 <!-- URI of the default file system. The Java examples in this article call
      FileSystem.get(conf) without any conf.set(...), so they rely on this
      value to reach the HDFS NameNode at hadoop01:9000. -->
 <property>
 <name>fs.defaultFS</name>
 <value>hdfs://hadoop01:9000</value>
</property>
<!-- Base directory that Hadoop uses for its other temporary/working directories. -->
<property>
 <name>hadoop.tmp.dir</name>
 <value>/home/hadoop/data/hadoopdata</value>
</property>
</configuration>

package com.hadoop.hdfs;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Random;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

/**
 * 1.使用流的方式的上传文件
 * 2.使用流的方式的下载文件
 * 3.从随机地方开始读，读任意长度
 */
/**
 * JUnit exercises for stream-based HDFS I/O.
 *
 * <ol>
 * <li>{@link #test1()} uploads a local file to HDFS through streams.</li>
 * <li>{@link #test2()} downloads an HDFS file to the local disk through streams.</li>
 * <li>{@link #test3()} reads a random number of bytes starting at a random offset.</li>
 * </ol>
 *
 * The HDFS address is not set in code; it comes from whatever configuration
 * files {@code new Configuration()} picks up.
 */
public class Test1 {
	Configuration conf = null;
	FileSystem fs = null;

	/**
	 * Opens the file system connection used by each test.
	 *
	 * @throws IOException if the file system cannot be obtained
	 */
	@Before
	public void start() throws IOException {
		// Act as the "hadoop" user when talking to the cluster.
		System.setProperty("HADOOP_USER_NAME", "hadoop");
		conf = new Configuration();
		fs = FileSystem.get(conf);
	}

	/**
	 * Releases the file system connection opened by {@link #start()}.
	 *
	 * @throws IOException if closing the file system fails
	 */
	@After
	public void stop() throws IOException {
		if (fs != null) {
			fs.close();
			fs = null;
		}
	}

	/**
	 * Uploads the local file {@code F:/test/words/1.txt} to {@code /testout11/2.txt} on HDFS.
	 *
	 * @throws IOException if either stream cannot be opened or the copy fails
	 */
	@Test
	public void test1() throws IOException, InterruptedException, URISyntaxException {
		// try-with-resources closes both streams even when opening the second one
		// or the copy itself fails, so copyBytes is told not to close them.
		try (FileInputStream in = new FileInputStream("F:/test/words/1.txt");
				FSDataOutputStream out = fs.create(new Path("/testout11/2.txt"))) {
			// Third argument is the copy buffer size in bytes.
			IOUtils.copyBytes(in, out, 1024, false);
		}
	}

	/**
	 * Downloads {@code /testout11/1.txt} from HDFS to the local file {@code /test/words/6.txt}.
	 *
	 * @throws IOException if either stream cannot be opened or the copy fails
	 */
	@Test
	public void test2() throws IOException, InterruptedException, URISyntaxException {
		// NOTE(review): the local target has no drive letter, unlike the "F:/..." path
		// used in test1 - confirm this is the intended location.
		try (FSDataInputStream in = fs.open(new Path("/testout11/1.txt"));
				FileOutputStream out = new FileOutputStream(new File("/test/words/6.txt"))) {
			// Third argument is the copy buffer size in bytes.
			IOUtils.copyBytes(in, out, 1024, false);
		}
	}

	/**
	 * Copies a random number of bytes (0-9) from {@code /testout11/1.txt}, starting at a
	 * random offset, into {@code /testout11/2.txt}.
	 *
	 * <p>The start offset is drawn from {@code [0, min(30, fileLength))} so the seek can
	 * never land past the end of the source file. Fewer bytes than requested are copied
	 * if the end of the file is reached first.
	 *
	 * @throws IOException if the source cannot be read or the target cannot be written
	 */
	@Test
	public void test3() throws IOException, InterruptedException, URISyntaxException {
		Path source = new Path("/testout11/1.txt");
		Random random = new Random();

		// Bound the offset by the real file length: seeking beyond EOF fails on HDFS.
		long sourceLength = fs.getFileStatus(source).getLen();
		long startOffset = sourceLength > 0 ? random.nextInt((int) Math.min(sourceLength, 30L)) : 0L;
		// Number of bytes to copy (may be 0, in which case an empty target file is created).
		long bytesToCopy = random.nextInt(10);

		try (FSDataInputStream in = fs.open(source);
				FSDataOutputStream out = fs.create(new Path("/testout11/2.txt"))) {
			in.seek(startOffset);
			// With a long third argument, copyBytes copies at most that many bytes;
			// it is a byte count, not a start position.
			IOUtils.copyBytes(in, out, bytesToCopy, false);
		}
	}
}

package com.hadoop.hdfs;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

/**
 * 手动拷贝某个特定的数据块
 */
/**
 * Copies one specific block of an HDFS file to the local disk: the second block of
 * {@code /testout11/a.tar.gz} is written to {@code F:/test/a.tar.gz}.
 */
public class Test2 {
	/**
	 * Looks up the block layout of the source file, seeks to the start of its second
	 * block and copies exactly that block's bytes to a local file.
	 *
	 * @param args unused
	 * @throws IOException if the file cannot be read, the local file cannot be written,
	 *                     or the source file has fewer than two blocks
	 */
	public static void main(String[] args) throws IOException {
		// Act as the "hadoop" user when talking to the cluster.
		System.setProperty("HADOOP_USER_NAME", "hadoop");
		Configuration conf = new Configuration();
		Path path = new Path("/testout11/a.tar.gz");

		try (FileSystem fs = FileSystem.get(conf)) {
			// Status of the single source file (replaces listStatus(path)[0]).
			FileStatus status = fs.getFileStatus(path);
			// Block locations covering the whole file, from offset 0 to its length.
			BlockLocation[] locations = fs.getFileBlockLocations(status, 0, status.getLen());
			// Validate before touching the local file so it is not truncated needlessly.
			if (locations == null || locations.length < 2) {
				int blockCount = locations == null ? 0 : locations.length;
				throw new IOException(path + " has only " + blockCount
						+ " block(s); there is no second block to copy");
			}

			BlockLocation secondBlock = locations[1];
			// Byte offset of the second block within the file, and its length in bytes.
			long offset = secondBlock.getOffset();
			long length = secondBlock.getLength();

			// try-with-resources closes both streams even if the seek or copy fails,
			// so copyBytes is told not to close them.
			try (FSDataInputStream in = fs.open(path);
					FileOutputStream out = new FileOutputStream(new File("F:/test/a.tar.gz"))) {
				// Start reading at the first byte of the second block.
				in.seek(offset);
				// The third argument is the number of bytes to copy, not a start position.
				IOUtils.copyBytes(in, out, length, false);
			}
		}
	}
}

package com.hadoop.hdfs;

import java.io.FileNotFoundException;
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

/**
 * 删除HDFS集群中的所有空文件和空目录
 */
/**
 * Deletes all empty files and empty directories below a given HDFS path.
 */
public class Test3 {
	/**
	 * Cleans up everything empty under {@code /test}.
	 *
	 * @param args unused
	 * @throws IOException if listing or deleting fails
	 */
	public static void main(String[] args) throws IOException {
		// Act as the "hadoop" user when talking to the cluster.
		System.setProperty("HADOOP_USER_NAME", "hadoop");
		Configuration conf = new Configuration();
		// try-with-resources closes the file system even if the cleanup throws.
		try (FileSystem fs = FileSystem.get(conf)) {
			deleteEmpty(fs, new Path("/test"));
		}
	}

	/**
	 * Removes zero-length files and empty directories at and below {@code path}.
	 *
	 * <p>Directories are processed bottom-up, so a directory that only contained empty
	 * files or empty subdirectories becomes empty itself and is removed as well. A
	 * directory that still has content afterwards is left untouched.
	 *
	 * <p>All deletes are non-recursive: if something is written into a directory between
	 * the emptiness check and the delete, the delete fails with an {@link IOException}
	 * instead of silently destroying the new data.
	 *
	 * @param fs   file system to operate on
	 * @param path file or directory to clean up
	 * @throws FileNotFoundException if {@code path} does not exist
	 * @throws IOException           if listing or deleting fails
	 */
	public static void deleteEmpty(FileSystem fs, Path path) throws FileNotFoundException, IOException {
		FileStatus status = fs.getFileStatus(path);
		if (status.isDirectory()) {
			deleteEmptyUnder(fs, path);
		} else if (status.getLen() == 0) {
			fs.delete(path, false);
		}
	}

	/** Post-order cleanup of one directory: children first, then the directory itself. */
	private static void deleteEmptyUnder(FileSystem fs, Path dir) throws IOException {
		for (FileStatus child : fs.listStatus(dir)) {
			if (child.isDirectory()) {
				// A directory's getLen() is 0 whether or not it has children, so
				// emptiness must be decided by descending into it, not by its length.
				deleteEmptyUnder(fs, child.getPath());
			} else if (child.getLen() == 0) {
				fs.delete(child.getPath(), false);
			}
		}
		// Re-list once after the children were handled: the directory may have become
		// empty. The root directory (no parent) is never deleted.
		if (dir.getParent() != null && fs.listStatus(dir).length == 0) {
			fs.delete(dir, false);
		}
	}
}

附:hdfsAPI帮助文档

hdfs常用API操作练习

猜你喜欢