25. Case Study: Building an Inverted Index for Multiple Log Files

Follow the column "Breaking the Cocoon and Becoming a Butterfly - Big Data" to read the related articles in this series~


Table of Contents

1. Requirements Analysis

2. Code Implementation


 

1. Requirements Analysis

We have a set of log files, and we need to count how many times each word appears in each file and map every word back to the files that contain it, i.e. build an inverted index over the logs. The contents of the original log files are as follows:

The expected result data is:

Implementation approach: the above requirement can be met in two steps (two MapReduce jobs). 1. First, count the number of times each word appears in each log file. 2. Then group and merge the results by word.
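Since the original sample data is shown only as images, assume, purely for illustration, two hypothetical log files a.log and b.log with the following contents; the examples later in this article reuse this assumed data.

a.log:
hello hadoop
hello spark

b.log:
hello hadoop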

2. Code Implementation

2.1 Mapper class for the first step

package com.xzw.hadoop.mapreduce.inverted_index;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

/**
 * @author: xzw
 * @create_date: 2020/12/7 8:40
 * @desc:
 * @modifier:
 * @modified_date:
 * @desc:
 */
public class OneIndexMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    String name;
    Text k = new Text();
    IntWritable v = new IntWritable();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        //get the name of the file this split comes from
        FileSplit split = (FileSplit) context.getInputSplit();
        name = split.getPath().getName();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        //1. Get a line of data and split it on spaces
        String line = value.toString();
        String[] fields = line.split(" ");

        for (String word: fields) {
            //2. Concatenate the word and the file name into the key
            k.set(word + "--" + name);
            v.set(1);

            //3. Write out the key-value pair
            context.write(k, v);
        }
    }
}

2.2 Reducer class for the first step

package com.xzw.hadoop.mapreduce.inverted_index;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * @author: xzw
 * @create_date: 2020/12/7 8:49
 * @desc:
 * @modifier:
 * @modified_date:
 * @desc:
 */
public class OneIndexReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    IntWritable v = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int sum = 0; //accumulator for the word count

        for (IntWritable value: values) {
            sum += value.get();
        }

        v.set(sum);

        context.write(key, v);
    }
}
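As an optional optimization (not part of the original code), this reducer is also safe to reuse as a combiner, since it only sums counts. A single extra line in the first-step driver would enable it:

        //optional: pre-aggregate counts on the map side by reusing OneIndexReducer as a combiner
        job.setCombinerClass(OneIndexReducer.class);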

2.3 Driver class for the first step

package com.xzw.hadoop.mapreduce.inverted_index;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * @author: xzw
 * @create_date: 2020/12/7 9:00
 * @desc:
 * @modifier:
 * @modified_date:
 * @desc:
 */
public class OneIndexDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        args = new String[]{"C:\\Users\\Machenike\\Desktop\\file", "C:\\Users\\Machenike\\Desktop\\file\\output1"};

        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);

        job.setJarByClass(OneIndexDriver.class);

        job.setMapperClass(OneIndexMapper.class);
        job.setReducerClass(OneIndexReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        boolean b = job.waitForCompletion(true);
        System.exit(b ? 0 : 1);
    }
}
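To run the first job on a cluster instead of with the hard-coded local paths, the args assignment would be removed and the input/output paths passed on the command line, roughly like this (the jar name and HDFS paths below are placeholders):

hadoop jar inverted-index.jar com.xzw.hadoop.mapreduce.inverted_index.OneIndexDriver /input/logs /output/index1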

2.4 Test results of the first step
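With the hypothetical a.log/b.log data assumed above, the first job's output would look roughly like this (key is word--filename, value is the count, tab-separated):

hadoop--a.log	1
hadoop--b.log	1
hello--a.log	2
hello--b.log	1
spark--a.log	1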

2.5 Mapper class for the second step

package com.xzw.hadoop.mapreduce.inverted_index;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * @author: xzw
 * @create_date: 2020/12/7 9:21
 * @desc:
 * @modifier:
 * @modified_date:
 * @desc:
 */
public class TwoIndexMapper extends Mapper<LongWritable, Text, Text, Text> {
    Text k = new Text();
    Text v = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        //each input line from the first job has the form: word--filename<TAB>count
        String line = value.toString();
        String[] fields = line.split("--");

        //key: the word; value: "filename<TAB>count" (the tab from the first job's output is kept)
        k.set(fields[0]);
        v.set(fields[1]);

        context.write(k, v);
    }
}

2.6 Reducer class for the second step

package com.xzw.hadoop.mapreduce.inverted_index;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * @author: xzw
 * @create_date: 2020/12/7 9:26
 * @desc:
 * @modifier:
 * @modified_date:
 * @desc:
 */
public class TwoIndexReducer extends Reducer<Text, Text, Text, Text> {
    Text v = new Text();

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        StringBuilder sb = new StringBuilder();

        for (Text value: values) {
            //turn "filename<TAB>count" into "filename-->count" and separate the entries with tabs
            sb.append(value.toString().replace("\t", "-->")).append("\t");
        }

        v.set(sb.toString());
        context.write(key, v);
    }
}
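To make the reducer's transformation concrete: with the assumed sample data, for the key hello it would receive the values "a.log	2" and "b.log	1" (the tab coming from the first job's output) and write out a line like:

hello	a.log-->2	b.log-->1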

2.7 Driver class for the second step

package com.xzw.hadoop.mapreduce.inverted_index;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * @author: xzw
 * @create_date: 2020/12/7 9:38
 * @desc:
 * @modifier:
 * @modified_date:
 * @desc:
 */
public class TwoIndexDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        args = new String[]{"C:\\Users\\Machenike\\Desktop\\file\\output1", "C:\\Users\\Machenike\\Desktop\\file" +
                "\\output2"};

        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);

        job.setJarByClass(TwoIndexDriver.class);
        job.setMapperClass(TwoIndexMapper.class);
        job.setReducerClass(TwoIndexReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        boolean b = job.waitForCompletion(true);
        System.exit(b ? 0 : 1);
    }
}

2.8 Test results of the second step
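With the assumed sample data, the final inverted index produced by the second job would look roughly like this:

hadoop	a.log-->1	b.log-->1
hello	a.log-->2	b.log-->1
spark	a.log-->1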


Origin: blog.csdn.net/gdkyxy2013/article/details/110820214