Big Data Interview Summary (Part 14)

1 Implement the following logic with Hive or a custom MapReduce job
product_no lac_id moment start_time user_id county_id staytime city_id
13429100031 22554 8 2013-03-11 08:55:19.151754088 571 571 282 571
13429100082 22540 8 2013-03-11 08:58:20.152622488 571 571 270 571
13429100082 22691 8 2013-03-11 08:56:37.149593624 571 571 103 571
13429100087 22705 8 2013-03-11 08:56:51.139539816 571 571 220 571
13429100087 22540 8 2013-03-11 08:55:45.150276800 571 571 66 571
13429100082 22540 8 2013-03-11 08:55:38.140225200 571 571 133 571
13429100140 26642 9 2013-03-11 09:02:19.151754088 571 571 18 571
13429100082 22691 8 2013-03-11 08:57:32.151754088 571 571 287 571
13429100189 22558 8 2013-03-11 08:56:24.139539816 571 571 48 571
13429100349 22503 8 2013-03-11 08:54:30.152622440 571 571 211 571
Field descriptions:
product_no: the user's mobile phone number;
lac_id: the base station the user is on;
start_time: the time at which the user's stay at this base station begins;
staytime: how long the user stays at this base station.

Requirement:
lac_id plus start_time tell us where the user was at a given moment, and staytime tells us how long the user stayed at each base station. Merge the staytime of consecutive records at the same base station along the user's trajectory, so that the final result gives, for each user, the dwell time at each base station ordered by time.

Expected output example:
13429100082 22540 8 2013-03-11 08:58:20.152622488 571 571 270 571
13429100082 22691 8 2013-03-11 08:56:37.149593624 571 571 390 571
13429100082 22540 8 2013-03-11 08:55:38.140225200 571 571 133 571
13429100087 22705 8 2013-03-11 08:56:51.139539816 571 571 220 571
13429100087 22540 8 2013-03-11 08:55:45.150276800 571 571 66 571
Answer
package org.aboutyun;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;

public class TimeCount {
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();

    Job job = new Job(conf, "time_count");

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.waitForCompletion(true);
}

public static class Map extends Mapper<LongWritable, Text, Text, Text> {
    private Text id = new Text();
    private Text row = new Text();

    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        String[] items = line.split("\t");

        if (items.length == 8) {
            if (StringUtils.isNumeric(items[6])) {
                id.set(items[0] + "\t" + items[1]);
                row.set(line);
                context.write(id, row);
            }
        } else {
            System.out.println("Wrong length: " + items.length);
        }
    }
}

public static class Reduce extends Reducer<Text, Text, Text, Text> {
    private static final SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

    static {
        format.setLenient(false);
    }

    private Text rest = new Text();

    public void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        //  Parse row to Record
        ArrayList<Record> list = new ArrayList<Record>();
        for (Text row : values) {
            String[] items = row.toString().split("\t");
            try {
                Record record = new Record();
                record.items = items;
                record.start_time = format.parse(items[3]).getTime();
                record.stay_time = Long.parseLong(items[6]) * 1000;
                list.add(record);
            } catch (ParseException e) {
                e.printStackTrace();
            }

        }

        //  Sort
        Collections.sort(list, new Comparator<Record>() {
            @Override
            public int compare(Record r1, Record r2) {
                //  Long.compare avoids the overflow a plain (int) cast of the difference can cause
                return Long.compare(r1.start_time, r2.start_time);
            }
        });

        //  Find and merge slice
        ArrayList<Record> result = new ArrayList<Record>();
        for (Record r1 : list) {
            boolean found = false;
            long r1_stop_time = r1.start_time + r1.stay_time;
            for (Record r2 : result) {
                long r2_stop_time = r2.start_time + r2.stay_time;
                if (r1.start_time > r2.start_time && r1.start_time <= r2_stop_time) {
                    if (r1_stop_time > r2_stop_time) {
                        //  merge the new slice: extend the earlier stay to cover this record
                        r2.stay_time = r1_stop_time - r2.start_time;
                    }
                    //  a record that lies entirely inside an existing stay adds nothing new
                    found = true;
                }
            }

            if (!found) {
                result.add(r1);
            }
        }

        //  Output: product_no as the key, remaining columns (with the merged staytime) as the value
        for (Record r : result) {
            key.set(r.items[0]);

            String value = r.items[1] + "\t"
                    + r.items[2] + "\t"
                    + r.items[3] + "\t"
                    + r.items[4] + "\t"
                    + r.items[5] + "\t"
                    + (r.stay_time / 1000) + "\t"
                    + r.items[7];   //  items[7] is city_id; items[6] was the original, unmerged staytime
            rest.set(value);

            context.write(key, rest);
        }

    }

    static class Record {
        String[] items;
        long start_time;
        long stay_time;
    }
}

}

2 Linux scripting skills
2.1 Using any scripting language you like, batch-replace $HADOOP_HOME with /home/ocetl/app/hadoop in every file under a specified directory.

2.2 Assume there are 10 hosts, H1 through H10, with SSH mutual trust (passwordless login) already set up. Write one or more scripts that run a given command on all of the remote hosts.
For example: runRemoteCmd.sh "ls -l"
Expected result:
H1:
XXXXXXXX
XXXXXXXX
XXXXXXXX
H2:
XXXXXXXX
XXXXXXXX
XXXXXXXX
H3:

Answer
2.1 Implemented with find + sed (point find at the specified target directory):
find /home/ocetl/app/hadoop -type f -exec sed -i 's|\$HADOOP_HOME|/home/ocetl/app/hadoop|g' {} \;
2.2 Pass the command straight through as an argument to ssh:

#!/bin/bash
if [ $# -ne 1 ]
then
    echo "Usage: `basename $0` {command}"
    exit
fi
for i in H1 H2 H3 H4 H5 H6 H7 H8 H9 H10
do
    echo "$i:"
    ssh $i "$1"
done

3 Hadoop fundamentals and problem-analysis ability
3.1 Describe the places in Hadoop where a caching mechanism is used, and what each one is for.

3.2 Describe what problem https://issues.apache.org/jira/browse/HDFS-2379 reports, and what approach was finally taken to solve it.
Answers:
3.1 Not familiar with the details; HDFS does use caching.
3.2 The problem is that when the disks are very large but there is little memory left for the page cache, generating the DataNode's block report takes a long time, and the FSVolumeSet lock is held for the whole duration, so no read or write operations can proceed and they eventually time out. The issue proposes a way to generate the block report without holding the FSVolumeSet lock, so that those operations no longer fail.

4 MapReduce development skills
Following the classic wordcount example, implement your own MapReduce job with these requirements:
a) Input file format:
xxx,xxx,xxx,xxx,xxx,xxx,xxx
b) Output file format:
xxx,20
xxx,30
xxx,40
c) Function: count how many times the keywords given on the command line appear in the input file, and output the result.
For example: hadoop jar xxxxx.jar keywordcount xxx,xxx,xxx,xxx (four keywords)
Answer
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import java.io.IOException;
import java.util.ArrayList;

public class WordCount {

public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();
    private final ArrayList<String> target_words = new ArrayList<String>();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        //  Read the keyword list from the job configuration. Filling a static list in the
        //  driver only works in local mode; mapper JVMs on a cluster would never see it.
        for (String w : context.getConfiguration().get("keyword.list", "").split(",")) {
            if (!w.isEmpty()) {
                target_words.add(w.toLowerCase());
            }
        }
    }

    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        //  Input fields are comma separated, so split on commas (and any stray whitespace)
        String[] items = value.toString().toLowerCase().split("[,\\s]+");
        for (String item : items) {
            if (target_words.contains(item)) {
                word.set(item);
                context.write(word, one);
            }
        }
    }
}

public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {

    public void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        context.write(key, new IntWritable(sum));
    }
}

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();

    if (args.length < 3) {
        System.out.println("Usage: wordcount <input_path> <output_path> <keyword_list>");
        return;
    }

    //  Pass the comma separated keyword list to the mappers through the job configuration
    conf.set("keyword.list", args[2]);

    Job job = new Job(conf, "wordcount");

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.waitForCompletion(true);
}

}
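Note: TextOutputFormat separates key and value with a tab by default, while the output format asked for in b) uses a comma. If an exact match is required, the separator can be overridden in the driver; a minimal sketch (the property name below is the Hadoop 2.x one, older releases use mapred.textoutputformat.separator):

//  In main(), after the Job is created: make TextOutputFormat emit "keyword,count"
job.getConfiguration().set("mapreduce.output.textoutputformat.separator", ",");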
5 MapReduce optimization
Based on the program in question 4, propose ways to make the MR job run faster.
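One concrete change, as a sketch against the question-4 driver (the lines below would go in its main() after the existing setReducerClass call): reuse Reduce as a combiner so partial sums are computed on the map side and far less data crosses the shuffle; compressing the intermediate map output and using more than one reduce task also help on large inputs.

//  Sketch of extra driver settings for the question-4 WordCount job
job.setCombinerClass(Reduce.class);   //  summation is associative and commutative, so Reduce doubles as a combiner
//  Compress intermediate map output to cut shuffle traffic
//  (property name is the Hadoop 2.x one; Hadoop 1.x uses mapred.compress.map.output)
job.getConfiguration().setBoolean("mapreduce.map.output.compress", true);
//  Use more than the default single reducer when the data volume justifies it
job.setNumReduceTasks(2);

Beyond driver settings, the usual levers are making sure the input splits give enough map parallelism, avoiding per-record object allocation in the map function, and reusing Writable objects (which the question-4 code already does).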

6 Linux operating system knowledge
List the configuration files under /etc that you have modified, and explain what problem each change was meant to solve.
hosts: add hostname-to-IP mappings for machines on the LAN, so you do not have to remember the IP addresses;
hostname: change the host name, which is often needed after cloning a virtual machine;
fstab: change mount points, needed when adding a new disk;
profile, bash.bashrc: commonly edited to change system-wide environment variables;
network/interfaces: needed when configuring a static IP.

7 Java development skills
7.1 Write code that counts the total number of lines in a 1 GB text file whose line separator is \x01\x02; pay attention to handling the boundary cases.

7.2 Describe how you would analyze the performance of the program above during development, and the process you would follow to optimize it.
Answers:

package org.aboutyun;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;

public class LineCounter {
    public static void main(String[] args) {
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(args[0]));
            char[] buffer = new char[4096];
            int count;
            char last = 0;
            long line_count = 0;
            while ((count = reader.read(buffer)) > 0) {
                if (line_count == 0) {
                    //  file is not empty, so there is at least 1 line
                    line_count = 1;
                }
                for (int i = 0; i < count; ++i) {
                    char c = buffer[i];
                    if (c == 0x02) {
                        if (i == 0 && last == 0x01) {
                            //  separator 0x01,0x02 split across two buffer reads
                            ++line_count;
                        } else if (i > 0 && buffer[i - 1] == 0x01) {
                            //  normal case: 0x01 immediately followed by 0x02
                            ++line_count;
                        }
                    }
                }
                //  keep the last char of this buffer for the cross-buffer check
                last = buffer[count - 1];
            }
            System.out.println(line_count);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}
7.2 A profiler can be used to evaluate and analyze performance, for example Eclipse TPTP or JProfiler. It shows how many times each method is called and how much time each takes, so you can reduce the number of calls and optimize the internals of the hot methods.
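Before reaching for a full profiler, a rough first pass is to measure wall-clock time for the read loop with different buffer sizes, since this program is dominated by I/O. A minimal sketch (the class name ReadTiming and the buffer sizes tried are illustrative, not part of the original answer):

package org.aboutyun;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;

//  Rough timing harness: read the whole file with several buffer sizes and report elapsed time
public class ReadTiming {
    public static void main(String[] args) throws IOException {
        for (int size : new int[]{4 * 1024, 64 * 1024, 1024 * 1024}) {
            long start = System.nanoTime();
            BufferedReader reader = new BufferedReader(new FileReader(args[0]), size);
            char[] buffer = new char[size];
            long chars = 0;
            int count;
            while ((count = reader.read(buffer)) > 0) {
                chars += count;
            }
            reader.close();
            long ms = (System.nanoTime() - start) / 1000000;
            System.out.println("buffer=" + size + " chars=" + chars + " time=" + ms + " ms");
        }
    }
}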
