How to use Apache Spark to process cleaned data

There are many ways to use Spark for data processing. The following is a demo of a Spark program that shows how to use Spark for data processing:

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import org.apache.spark.api.java.function.PairFunction;

import scala.Tuple2;
/**
 * Demo Spark batch job over cleaned sales data.
 *
 * <p>Input: {@code data.txt}, one CSV record per line in the form
 * {@code color,size,quantity,price,timestamp}. Outputs three text
 * directories: per-color quantities, per-size quantities, and the
 * total revenue.
 */
public class SparkDataProcessing {

    public static void main(String[] args) {
        // Configure and start the Spark application.
        SparkConf conf = new SparkConf().setAppName("SparkDataProcessing");
        JavaSparkContext sc = new JavaSparkContext(conf);
        try {
            // Read the cleaned input file; each element is one raw CSV line.
            JavaRDD<String> lines = sc.textFile("data.txt");

            // Parse each line into a Record.
            // NOTE(review): assumes every line has exactly 5 well-formed
            // fields ("cleaned data"); a malformed line will fail the task.
            JavaRDD<Record> records = lines.map(new Function<String, Record>() {
                @Override
                public Record call(String line) throws Exception {
                    String[] fields = line.split(",");
                    return new Record(
                            fields[0],
                            fields[1],
                            Integer.parseInt(fields[2]),
                            Double.parseDouble(fields[3]),
                            Long.parseLong(fields[4]));
                }
            });
            // records feeds three independent jobs below; cache it so the
            // input file is not re-read and re-parsed for each one.
            records.cache();

            // Quantity sold per color.
            // FIX: reduceByKey exists only on JavaPairRDD, so we must build
            // (color, quantity) pairs with mapToPair first — the original
            // called reduceByKey on a plain JavaRDD, which does not compile.
            JavaPairRDD<String, Integer> colorSales = records
                    .mapToPair(new PairFunction<Record, String, Integer>() {
                        @Override
                        public Tuple2<String, Integer> call(Record record) throws Exception {
                            return new Tuple2<>(record.getColor(), record.getQuantity());
                        }
                    })
                    .reduceByKey(new Function2<Integer, Integer, Integer>() {
                        @Override
                        public Integer call(Integer v1, Integer v2) throws Exception {
                            return v1 + v2;
                        }
                    });
            // Wrap the reduced pairs back into ColorQuantity so the saved
            // text keeps the "color,quantity" format of its toString().
            JavaRDD<ColorQuantity> colorSalesRDD = colorSales.map(
                    new Function<Tuple2<String, Integer>, ColorQuantity>() {
                        @Override
                        public ColorQuantity call(Tuple2<String, Integer> t) throws Exception {
                            return new ColorQuantity(t._1(), t._2());
                        }
                    });

            // Quantity sold per size. Records with a null or empty size are
            // skipped, preserving the original flatMap filter.
            JavaPairRDD<String, Integer> sizeSales = records
                    .flatMapToPair(new PairFlatMapFunction<Record, String, Integer>() {
                        @Override
                        public Iterator<Tuple2<String, Integer>> call(Record record) throws Exception {
                            List<Tuple2<String, Integer>> out = new ArrayList<>();
                            if (record.getSize() != null && !record.getSize().isEmpty()) {
                                out.add(new Tuple2<>(record.getSize(), record.getQuantity()));
                            }
                            return out.iterator();
                        }
                    })
                    .reduceByKey(new Function2<Integer, Integer, Integer>() {
                        @Override
                        public Integer call(Integer v1, Integer v2) throws Exception {
                            return v1 + v2;
                        }
                    });
            JavaRDD<SizeQuantity> sizeSalesRDD = sizeSales.map(
                    new Function<Tuple2<String, Integer>, SizeQuantity>() {
                        @Override
                        public SizeQuantity call(Tuple2<String, Integer> t) throws Exception {
                            return new SizeQuantity(t._1(), t._2());
                        }
                    });

            // Total revenue.
            // FIX: revenue must weight unit price by quantity; the original
            // summed unit prices alone. (double is acceptable for a demo;
            // real money handling should use a scaled long or BigDecimal.)
            double totalSales = records.map(new Function<Record, Double>() {
                @Override
                public Double call(Record record) throws Exception {
                    return record.getPrice() * record.getQuantity();
                }
            }).reduce(new Function2<Double, Double, Double>() {
                @Override
                public Double call(Double v1, Double v2) throws Exception {
                    return v1 + v2;
                }
            });

            // Persist all three results as text output directories.
            colorSalesRDD.saveAsTextFile("color_sales");
            sizeSalesRDD.saveAsTextFile("size_sales");
            sc.parallelize(Arrays.asList(totalSales)).saveAsTextFile("total_sales");
        } finally {
            // Always release cluster resources, even if a job above failed.
            sc.close();
        }
    }

    /**
     * One cleaned input row: color, size, quantity sold, unit price, and
     * the event timestamp (epoch value as parsed from the file).
     * Serializable because instances are shipped between Spark executors.
     */
    static class Record implements Serializable {

        private static final long serialVersionUID = 1L;

        private final String color;
        private final String size;
        private final int quantity;
        private final double price;
        private final long timestamp;

        public Record(String color, String size, int quantity, double price, long timestamp) {
            this.color = color;
            this.size = size;
            this.quantity = quantity;
            this.price = price;
            this.timestamp = timestamp;
        }

        public String getColor() {
            return color;
        }

        public String getSize() {
            return size;
        }

        public int getQuantity() {
            return quantity;
        }

        public double getPrice() {
            return price;
        }

        public long getTimestamp() {
            return timestamp;
        }
    }

    /** A (color, total quantity) result row; toString defines the saved CSV format. */
    static class ColorQuantity implements Serializable {

        private static final long serialVersionUID = 1L;

        private final String color;
        private final int quantity;

        public ColorQuantity(String color, int quantity) {
            this.color = color;
            this.quantity = quantity;
        }

        public String getColor() {
            return color;
        }

        public int getQuantity() {
            return quantity;
        }

        @Override
        public String toString() {
            // Format used verbatim by saveAsTextFile.
            return color + "," + quantity;
        }
    }

    /** A (size, total quantity) result row; toString defines the saved CSV format. */
    static class SizeQuantity implements Serializable {

        private static final long serialVersionUID = 1L;

        private final String size;
        private final int quantity;

        public SizeQuantity(String size, int quantity) {
            this.size = size;
            this.quantity = quantity;
        }

        public String getSize() {
            return size;
        }

        public int getQuantity() {
            return quantity;
        }

        @Override
        public String toString() {
            // Format used verbatim by saveAsTextFile.
            return size + "," + quantity;
        }
    }
}

In this Spark program, we first create a JavaSparkContext to start the Spark application, then read the data file into an RDD with the textFile method. Next, we use map to convert each line into a Record object, which stores the color, size, quantity sold, price, and timestamp. We then aggregate the sales quantity for each color and each size by building key/value pairs and combining them with reduceByKey, and we compute the total sales with a map followed by a reduce. Finally, we save the results to files.
Note that this program implements Spark's function interfaces with anonymous inner classes; on Java 8 and later, each of these could be written more concisely as a lambda expression or method reference.

Guess you like

Origin blog.csdn.net/weixin_43031220/article/details/130655663