3.1.5 CombineTextInputFormat: a practical example
Example: count the number of words
Preparation: create an input folder in the root directory of HDFS, then place four small files in it, of sizes 1.5M, 35M, 5.5M, and 6.5M, as input data
Code
/**
 * Mapper for the word-count job.
 *
 * <p>Each input value is treated as one line of text. The line is split into
 * tokens with {@link StringTokenizer}'s default delimiters, and a
 * {@code (token, 1)} pair is written to the context for every token.
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    /** Output key holder; the same instance is refilled for every token. */
    private final Text mapOutputKey = new Text();

    /**
     * Constant count of 1 written with every token. Initialised once here
     * rather than being re-set on each loop iteration.
     */
    private final IntWritable mapOutputValue = new IntWritable(1);

    /**
     * Tokenizes one input line and emits {@code (token, 1)} for each token.
     *
     * @param key     the input key (not used by this mapper)
     * @param value   the line of text to tokenize
     * @param context the context the output pairs are written to
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if writing to the context is interrupted
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        StringTokenizer tokens = new StringTokenizer(value.toString());
        while (tokens.hasMoreTokens()) {
            mapOutputKey.set(tokens.nextToken());
            context.write(mapOutputKey, mapOutputValue);
        }
    }
}
/**
 * Reducer for the word-count job.
 *
 * <p>Adds up every count received for a key and writes a single
 * {@code (key, total)} pair to the context.
 */
public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    /** Output value holder; refilled with the total on every reduce call. */
    private IntWritable outputValue = new IntWritable();

    /**
     * Sums the counts for one key and emits the total.
     *
     * @param key     the word being counted
     * @param values  the counts received for {@code key}
     * @param context the context the result is written to
     * @throws IOException          if writing to the context fails
     * @throws InterruptedException if writing to the context is interrupted
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int total = 0;
        for (IntWritable count : values) {
            total = total + count.get();
        }

        outputValue.set(total);
        context.write(key, outputValue);
    }
}
/**
 * Driver that configures and submits the word-count job, using
 * {@code CombineTextInputFormat} as the input format.
 */
public class WordCountDriver {

    /** Input path used when no paths are supplied on the command line. */
    private static final String DEFAULT_INPUT_PATH = "/input/";

    /** Output path used when no paths are supplied on the command line. */
    private static final String DEFAULT_OUTPUT_PATH = "/output/";

    /** Maximum input split size handed to CombineTextInputFormat: 20 MiB. */
    private static final long MAX_INPUT_SPLIT_SIZE_BYTES = 20L * 1024 * 1024;

    /**
     * Builds the job, runs it to completion, and exits the JVM with status 0
     * if the job succeeded or 1 if it did not.
     *
     * @param args {@code args[0]} is the input path and {@code args[1]} the
     *             output path; when fewer than two arguments are given, the
     *             defaults {@code /input/} and {@code /output/} are used
     * @throws Exception if job setup or execution fails
     */
    public static void main(String[] args) throws Exception {
        // Fall back to the tutorial's fixed paths only when the caller did not
        // supply both; previously the command-line arguments were always discarded.
        if (args == null || args.length < 2) {
            args = new String[] {DEFAULT_INPUT_PATH, DEFAULT_OUTPUT_PATH};
        }

        Configuration cfg = new Configuration();
        Job job = Job.getInstance(cfg, WordCountDriver.class.getSimpleName());
        job.setJarByClass(WordCountDriver.class);

        // Use the combining input format and cap the size of each split.
        job.setInputFormatClass(CombineTextInputFormat.class);
        CombineTextInputFormat.setMaxInputSplitSize(job, MAX_INPUT_SPLIT_SIZE_BYTES);

        job.setMapperClass(WordCountMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        job.setReducerClass(WordCountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }
}
Result of the run