springboot集成hadoop实现hdfs功能

pom.xml:

<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.8.5</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-hdfs -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>2.8.5</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-client -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.8.5</version>
        </dependency>

application.properties:


server.port=8569

#hdfs
hadoop.name-node: hdfs://192.168.4.252:9000
hadoop.namespace: /bestdir

HadoopConfig:

package com.zkaw.hadoop.config;

import lombok.extern.slf4j.Slf4j;
import org.apache.hadoop.fs.FileSystem;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import java.net.URI;

/**
 * Spring configuration that exposes a single shared HDFS {@link FileSystem} client as a bean.
 *
 * <p>The configuration is only activated when the {@code hadoop.name-node} property is defined.
 */
@Configuration
@ConditionalOnProperty(name="hadoop.name-node")
@Slf4j
public class HadoopConfig {
    /** Identity the client uses when talking to HDFS. */
    private static final String HDFS_USER = "root";

    /** NameNode URI, e.g. {@code hdfs://host:9000}, read from {@code hadoop.name-node}. */
    @Value("${hadoop.name-node}")
    private String nameNode;

    /**
     * Creates the HDFS client used by the rest of the application.
     *
     * <p>A fresh Hadoop {@code Configuration} is created and the settings below are applied
     * on top of it:
     * <ul>
     *   <li>{@code fs.defaultFS} - address of the HDFS NameNode;</li>
     *   <li>{@code dfs.replication} - written files are replicated once;</li>
     *   <li>{@code dfs.client.use.datanode.hostname} - the client connects to DataNodes by
     *       hostname, which is required when the DataNode IPs reported by the NameNode are
     *       not reachable from the client machine.</li>
     * </ul>
     *
     * @return the HDFS client connected as {@value #HDFS_USER}
     * @throws Exception if the NameNode URI is malformed or the client cannot be created
     */
    @Bean("fileSystem")
    public FileSystem createFs() throws Exception {
        // Trim once so the URI and fs.defaultFS always agree, even if the property
        // value carries stray whitespace.
        String nameNodeUri = nameNode.trim();

        org.apache.hadoop.conf.Configuration conf = new org.apache.hadoop.conf.Configuration();
        conf.set("fs.defaultFS", nameNodeUri);
        conf.set("dfs.replication", "1");
        conf.set("dfs.client.use.datanode.hostname", "true");

        // Obtain the client exactly once: a handle that is created and then discarded
        // would never be closed. Failures propagate so the application does not start
        // with a missing FileSystem bean.
        FileSystem fs = FileSystem.get(new URI(nameNodeUri), conf, HDFS_USER);

        log.info("fs.defaultFS: {}", conf.get("fs.defaultFS"));
        return fs;
    }
}

HdfsController：

package com.zkaw.hadoop.controller;

import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;

import java.io.IOException;

/**
 * @Author: best_liu
 * @Description:
 * @Date Create in 14:20 2023/4/21
 * @Modified By:
 */
@RequestMapping("/hdfs")
@RestController
@Slf4j
public class HdfsController {
    @Value("${hadoop.name-node}")
    private String nameNode;
    @Value("${hadoop.namespace:/}")
    private String nameSpace;

    @Autowired
    private FileSystem fileSystem;


    /*** 将本地文件srcFile,上传到hdfs
     * @param srcFile
     * @return
     */
    @PostMapping("/upload")
    public String upload( String srcFile){
        srcFile = "D:\\test.txt";
        uploadFile(srcFile);
        return "upload";
    }

    public void uploadFile(String srcFile){
        this.copyFileToHDFS(false,true,srcFile,nameSpace);
    }
    public  void copyFileToHDFS(boolean delSrc, boolean overwrite,String srcFile,String destPath) {
        // 源文件路径是Linux下的路径，如果在 windows 下测试，需要改写为Windows下的路径，比如D://hadoop/djt/weibo.txt
        Path srcPath = new Path(srcFile);
        // 目的路径
        if(StringUtils.isNotBlank(nameNode)){
            destPath = nameNode + destPath;
        }
        Path dstPath = new Path(destPath);
        // 实现文件上传
        try {
            // 获取FileSystem对象
            fileSystem.copyFromLocalFile(srcPath, dstPath);
            fileSystem.copyFromLocalFile(delSrc,overwrite,srcPath, dstPath);
            //释放资源//
//            fileSystem.close();
        } catch (IOException e) {
            log.error("", e);
        }
    }

    @PostMapping("/delFile")
    public String del(String fileName){
        rmdir(nameSpace,"test.txt") ;
        return "delFile";
    }

    public void rmdir(String path,String fileName) {
        try {
            // 返回FileSystem对象
            if(StringUtils.isNotBlank(nameNode)){
                path = nameNode + path;
            }
            if(StringUtils.isNotBlank(fileName)){
                path =  path + "/" +fileName;
            }
            // 删除文件或者文件目录  delete(Path f) 此方法已经弃用
            fileSystem.delete(new Path(path),true);
        } catch (IllegalArgumentException | IOException e) {
            log.error("", e);
        }
    }

    @PostMapping("/download")
    public String download(String fileName,String savePath){
        getFile(nameSpace+"/"+"test.txt","D:\\work\\lxjTest\\hadoopmaster");
        return "download";
    }
    /*** 从 HDFS 下载文件
     ** @param hdfsFile
     * @param destPath 文件下载后,存放地址
     */
    public void getFile(String hdfsFile,String destPath) {
        // 源文件路径
        if(StringUtils.isNotBlank(nameNode)){
            hdfsFile = nameNode + hdfsFile;
        }
        Path hdfsPath = new Path(hdfsFile);
        Path dstPath = new Path(destPath);
        try {
            // 下载hdfs上的文件
            fileSystem.copyToLocalFile(hdfsPath, dstPath);
            // 释放资源//
            fileSystem.close();
        } catch (IOException e) {
            log.error("", e);
        }
    }
}

遇到的问题

这个时候你可能会高高兴兴地用 Postman 进行测试，却发现报了下面这个错误：

 File /test/test.txt could only be replicated to 0 nodes instead of minReplication (=1).  There are 1 datanode(s) running and 1 node(s) are excluded in this operation.

但是你回到前面访问过的那个 50070 web 页面，会发现目录里是有 test.txt 的，但是大小却为 0

在这里插入图片描述

原因分析:

我们知道,客户端对Hdfs文件系统访问的大概流程是:

客户端通过公网ip+端口与namenode进行通信
namenode返回datanode的地址,注意是hdfs文件系统的内网地址!!!
客户端根据地址去和datanode进行连接
但是,我们是在自己的电脑去访问虚拟机,用的是公网的地址,这样当然是无法访问hdfs的内网地址,也无法和datanode建立正常的输送连接,这也是为什么namenode上有目录,但是大小却为0的原因,也是为什么web界面无法下载文件的原因

所以,我们要去想办法让namenode不要返回datanode的内网地址,而是公网的地址,这样我们才能访问和连接

解决办法:

想办法使本地可以访问到 DataNode。

1.添加一句客户端配置，使客户端通过 DataNode 的主机名（而不是 NameNode 返回的内网 IP）去连接 DataNode：

conf.set("dfs.client.use.datanode.hostname", "true");

另外也可以在客户端（classpath 下）的 hdfs-site.xml 文件中配置如下内容：

<property>
	<name>dfs.client.use.datanode.hostname</name>
	<value>true</value>
</property>

2.本地可以拿到了 DataNode 的主机名，要访问还需要配置本地 Hosts 映射：

windows 下 hosts 文件地址：C:\Windows\System32\drivers\etc\hosts
你的虚拟机公网ip master
192.168.4.xx master

3.结果访问正常