How to use Spark to process cleaned data

There are many ways to process data with Spark. Below is a demo Spark program that shows one approach:

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
public class SparkDataProcessing {

    public static void main(String[] args) {
        // Create a SparkConf object (the master URL is typically supplied via spark-submit)
        SparkConf conf = new SparkConf().setAppName("SparkDataProcessing");
        // Create a JavaSparkContext object
        JavaSparkContext sc = new JavaSparkContext(conf);
        // Read the data file into an RDD, one element per line
        JavaRDD<String> lines = sc.textFile("data.txt");
        // Parse each CSV line into a Record object
        JavaRDD<Record> records = lines.map(
            new Function<String, Record>() {
                @Override
                public Record call(String line) throws Exception {
                    String[] fields = line.split(",");
                    String color = fields[0];
                    String size = fields[1];
                    int quantity = Integer.parseInt(fields[2]);
                    double price = Double.parseDouble(fields[3]);
                    long timestamp = Long.parseLong(fields[4]);
                    return new Record(color, size, quantity, price, timestamp);
                }
            }
        );
        // Count the quantity sold per color. reduceByKey is only available on
        // JavaPairRDD, so first map each Record to a (color, quantity) pair,
        // then sum per key and convert the results back to ColorQuantity objects.
        JavaRDD<ColorQuantity> colorSalesRDD = records.mapToPair(
            new PairFunction<Record, String, Integer>() {
                @Override
                public Tuple2<String, Integer> call(Record record) throws Exception {
                    return new Tuple2<>(record.getColor(), record.getQuantity());
                }
            }
        ).reduceByKey(
            new Function2<Integer, Integer, Integer>() {
                @Override
                public Integer call(Integer v1, Integer v2) throws Exception {
                    return v1 + v2;
                }
            }
        ).map(
            new Function<Tuple2<String, Integer>, ColorQuantity>() {
                @Override
                public ColorQuantity call(Tuple2<String, Integer> tuple) throws Exception {
                    return new ColorQuantity(tuple._1(), tuple._2());
                }
            }
        );
        // Count the quantity sold per size, skipping records with a missing size
        JavaRDD<SizeQuantity> sizeSalesRDD = records.flatMapToPair(
            new PairFlatMapFunction<Record, String, Integer>() {
                @Override
                public Iterator<Tuple2<String, Integer>> call(Record record) throws Exception {
                    List<Tuple2<String, Integer>> list = new ArrayList<>();
                    if (record.getSize() != null && !record.getSize().isEmpty()) {
                        list.add(new Tuple2<>(record.getSize(), record.getQuantity()));
                    }
                    return list.iterator();
                }
            }
        ).reduceByKey(
            new Function2<Integer, Integer, Integer>() {
                @Override
                public Integer call(Integer v1, Integer v2) throws Exception {
                    return v1 + v2;
                }
            }
        ).map(
            new Function<Tuple2<String, Integer>, SizeQuantity>() {
                @Override
                public SizeQuantity call(Tuple2<String, Integer> tuple) throws Exception {
                    return new SizeQuantity(tuple._1(), tuple._2());
                }
            }
        );
        // Compute total revenue: unit price times quantity, summed over all records
        double totalSales = records.map(new Function<Record, Double>() {
            @Override
            public Double call(Record record) throws Exception {
                return record.getPrice() * record.getQuantity();
            }
        }).reduce(new Function2<Double, Double, Double>() {
            @Override
            public Double call(Double v1, Double v2) throws Exception {
                return v1 + v2;
            }
        });
        // Save the results as text files (the output directories must not already exist)
        colorSalesRDD.saveAsTextFile("color_sales");
        sizeSalesRDD.saveAsTextFile("size_sales");
        sc.parallelize(Arrays.asList(totalSales)).saveAsTextFile("total_sales");
        // Close the JavaSparkContext
        sc.close();
    }
    // The Record class represents one input record
    static class Record implements Serializable {
        private static final long serialVersionUID = 1L;
        private String color;
        private String size;
        private int quantity;
        private double price;
        private long timestamp;

        public Record(String color, String size, int quantity, double price, long timestamp) {
            this.color = color;
            this.size = size;
            this.quantity = quantity;
            this.price = price;
            this.timestamp = timestamp;
        }

        public String getColor() {
            return color;
        }

        public String getSize() {
            return size;
        }

        public int getQuantity() {
            return quantity;
        }

        public double getPrice() {
            return price;
        }

        public long getTimestamp() {
            return timestamp;
        }
    }
    // The ColorQuantity class associates a color with its total quantity sold
    static class ColorQuantity implements Serializable {
        private static final long serialVersionUID = 1L;
        private String color;
        private int quantity;

        public ColorQuantity(String color, int quantity) {
            this.color = color;
            this.quantity = quantity;
        }

        public String getColor() {
            return color;
        }

        public int getQuantity() {
            return quantity;
        }

        @Override
        public String toString() {
            return color + "," + quantity;
        }
    }
    // The SizeQuantity class associates a size with its total quantity sold
    static class SizeQuantity implements Serializable {
        private static final long serialVersionUID = 1L;
        private String size;
        private int quantity;

        public SizeQuantity(String size, int quantity) {
            this.size = size;
            this.quantity = quantity;
        }

        public String getSize() {
            return size;
        }

        public int getQuantity() {
            return quantity;
        }

        @Override
        public String toString() {
            return size + "," + quantity;
        }
    }
}
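
To run the program, package it into a jar and submit it with spark-submit. A minimal sketch, assuming a hypothetical jar name spark-demo.jar and a local master:

    spark-submit --class SparkDataProcessing --master "local[*]" spark-demo.jar

Note that saveAsTextFile will fail if an output directory (color_sales, size_sales, or total_sales) already exists, so remove any previous output before rerunning.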

In this Spark program, we first create a JavaSparkContext for the application. We then read the data file into an RDD with textFile and use map to turn each line into a Record object holding the color, size, quantity, price, and timestamp. Next, we use mapToPair/flatMapToPair together with reduceByKey to compute the quantity sold per color and per size, and use map plus reduce to compute the total revenue. Finally, we save the results to files.
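
As a concrete (hypothetical) illustration of the expected input format, each line of data.txt is comma-separated as color,size,quantity,price,timestamp, for example:

    red,M,3,19.99,1684800000000
    blue,L,1,29.99,1684800060000
    red,S,2,19.99,1684800120000

With this input, color_sales would contain red,5 and blue,1, size_sales would contain M,3, L,1, and S,2, and the total revenue would be 3*19.99 + 1*29.99 + 2*19.99 = 129.94.
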
Note that this program implements Spark's function interfaces with anonymous inner classes. Because each of these interfaces has a single abstract method, Java 8 lambda expressions and method references can express the same logic far more concisely, as shown below.
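
As a sketch of that more concise style, the per-color count and the revenue computation above could be rewritten with lambdas as follows (same imports and the same records RDD as in the full program):

    JavaRDD<ColorQuantity> colorSalesRDD = records
            .mapToPair(r -> new Tuple2<>(r.getColor(), r.getQuantity())) // (color, quantity) pairs
            .reduceByKey(Integer::sum)                                   // sum quantities per color
            .map(t -> new ColorQuantity(t._1(), t._2()));                // back to ColorQuantity

    double totalSales = records
            .map(r -> r.getPrice() * r.getQuantity())
            .reduce(Double::sum);

The behavior is identical; the lambda form simply drops the anonymous-class boilerplate.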

Reposted from blog.csdn.net/weixin_43031220/article/details/130655663