Version 1
Key point: each line is split on tabs, so a valid record always has a fixed number of fields; rows with any other field count are treated as garbage and filtered out.
package sql;
import java.util.ArrayList;
import java.util.List;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.apache.log4j.Logger;
import org.apache.log4j.Level;
public class SparkSqlCluster {
    public static void main(String[] args) {
        if (args.length < 2) {
            System.err.println("Usage: SparkSqlCluster <time> <ip>");
            System.exit(1);
        }
        Logger.getRootLogger().setLevel(Level.WARN);
        SparkSession spark = SparkSession
                .builder()
                .master("spark://master106:7077")
                .appName("Java Spark SQL basic example")
                .config("spark.some.config.option", "some-value")
                .getOrCreate();
        long startTime = System.currentTimeMillis();
        runProgrammaticSchema(spark, args[0], args[1]);
        long stopTime = System.currentTimeMillis();
        System.out.println("run time: " + (stopTime - startTime) / 1000 + " s");
        System.out.println("Arguments: " + args[0] + " : " + args[1]);
        spark.stop();
    }
    private static void runProgrammaticSchema(SparkSession spark, String time, String ip) {
        // Create an RDD of raw log lines and drop rows whose field count is wrong
        JavaRDD<String> peopleRDD = spark.sparkContext()
                // .textFile("hdfs://master106/spark_log", 1)
                .textFile("hdfs://master106/data/sni_test/*", 1)
                .toJavaRDD()
                .filter(new Function<String, Boolean>() {
                    @Override
                    public Boolean call(String v1) throws Exception {
                        String[] v = v1.split("\t");
                        // keep only lines that split into exactly 21 tab-separated fields
                        return v.length == 21;
                    }
                });
        long startTime = System.currentTimeMillis();
        // The schema is encoded in a string
        // k2, k3        -> time columns
        // k4, k10, k15  -> ip columns
        String schemaString = "k1,k2,k3,k4,k5,k6,k7,k8,k9,k10,k11,k12,k13,k14,k15,k16,k17,k18,k19,k20";
        // Generate the schema based on the string of schema
        List<StructField> fields = new ArrayList<>();
        for (String fieldName : schemaString.split(",")) {
            StructField field = DataTypes.createStructField(fieldName, DataTypes.StringType, true);
            fields.add(field);
        }
        StructType schema = DataTypes.createStructType(fields);
        // Convert records of the RDD to Rows (only the first 20 of the 21 fields are mapped to the schema)
        JavaRDD<Row> rowRDD = peopleRDD.map(new Function<String, Row>() {
            @Override
            public Row call(String record) throws Exception {
                String[] v = record.split("\t");
                System.out.println(v.length); // debug: field count, written to executor logs
                return RowFactory.create(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7], v[8], v[9],
                        v[10], v[11], v[12], v[13], v[14], v[15], v[16], v[17], v[18], v[19]);
            }
        });
        // Apply the schema to the RDD
        Dataset<Row> peopleDataFrame = spark.createDataFrame(rowRDD, schema);
        // Create a temporary view using the DataFrame
        peopleDataFrame.createOrReplaceTempView("people");
        // SQL can be run over a temporary view created using DataFrames
        // typical predicates: (k2 or k3) for time, (k4 or k10 or k15) for ip
        Dataset<Row> results = spark.sql("SELECT * FROM people where k2 like '%" + time + "' and k10 like '%" + ip + "'");
        // Dataset<Row> results = spark.sql("SELECT * FROM people where k2 like '%2019-03-01%' and k10 like '%210.40.16.90%'");
        // substring match: k like '%fdsfs%'
        // results.show(false);
        // The results of SQL queries are DataFrames and support all the normal RDD operations
        // The columns of a row in the result can be accessed by field index or by field name
        Dataset<String> namesDS = results.map(new MapFunction<Row, String>() {
            @Override
            public String call(Row row) throws Exception {
                return row.mkString();
            }
        }, Encoders.STRING());
        namesDS.show(false);
    }
}
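The query above is built by concatenating the time and ip arguments into the SQL text. As a minimal alternative sketch, the same predicate can be expressed with the Column API on peopleDataFrame, so no SQL string has to be assembled by hand; the endsWith calls are meant to mirror the LIKE '%...' patterns in the listing (this is an illustrative variant, not part of the original program, and needs a static import of org.apache.spark.sql.functions.col):

import static org.apache.spark.sql.functions.col;

// Equivalent to: SELECT * FROM people WHERE k2 LIKE '%<time>' AND k10 LIKE '%<ip>'
Dataset<Row> results = peopleDataFrame
        .filter(col("k2").endsWith(time)
                .and(col("k10").endsWith(ip)));
results.show(false);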
Version 2
Accumulators are added so the job reports the total number of input rows, how many rows were filtered out, and how many were kept.
package sql;
import java.util.ArrayList;
import java.util.List;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.apache.log4j.Logger;
import org.apache.log4j.Level;
import org.apache.spark.util.LongAccumulator;
public class SparkSqlCluster {
    public static void main(String[] args) {
        if (args.length < 3) {
            System.err.println("Usage: SparkSqlCluster <file> <time> <ip>");
            System.exit(1);
        }
        Logger.getRootLogger().setLevel(Level.WARN);
        SparkSession spark = SparkSession
                .builder()
                .master("spark://master106:7077")
                .appName("Spark SQL")
                .config("spark.some.config.option", "some-value")
                .getOrCreate();
        // Create three accumulators: total input rows, rows kept, rows filtered out
        LongAccumulator accumSum = spark.sparkContext().longAccumulator("accumSum");
        LongAccumulator accumReserved = spark.sparkContext().longAccumulator("Reserved");
        LongAccumulator accumFiltered = spark.sparkContext().longAccumulator("Filtered");
        long startTime = System.currentTimeMillis();
        runProgrammaticSchema(spark, accumSum, accumFiltered, accumReserved, args[0], args[1], args[2]);
        long stopTime = System.currentTimeMillis();
        System.out.println("run time: " + (stopTime - startTime) / 1000 + " s");
        System.out.println();
        System.out.println("Argument path: " + args[0]);
        System.out.println("Argument time (k2): " + args[1]);
        System.out.println("Argument ip (k4): " + args[2]);
        spark.stop();
    }
    private static void runProgrammaticSchema(SparkSession spark, LongAccumulator accumSum, LongAccumulator accumFiltered,
                                              LongAccumulator accumReserved, String path, String time, String ip) {
        // Create an RDD of raw log lines
        JavaRDD<String> LogRDD = spark.sparkContext()
                // .textFile("hdfs://master106/spark_log", 1)
                .textFile(path, 2)
                .toJavaRDD()
                .filter(new Function<String, Boolean>() {
                    @Override
                    public Boolean call(String v1) throws Exception {
                        // count every input line
                        accumSum.add(1);
                        String[] v = v1.split("\t");
                        if (v.length != 21) {
                            // count and log the malformed line
                            accumFiltered.add(1);
                            System.out.println(v1);
                        }
                        return v.length == 21;
                    }
                }).map(new Function<String, String>() {
                    @Override
                    public String call(String v1) throws Exception {
                        // count the lines kept after filtering
                        // (accumulator updates inside transformations may be re-applied if a task is retried)
                        accumReserved.add(1);
                        return v1;
                    }
                });
        long startTime = System.currentTimeMillis();
        String schemaString = "k1,k2,k3,k4,k5,k6,k7,k8,k9,k10,k11,k12,k13,k14,k15,k16,k17,k18,k19,k20";
        // Generate the schema based on the string of schema
        List<StructField> fields = new ArrayList<>();
        for (String fieldName : schemaString.split(",")) {
            StructField field = DataTypes.createStructField(fieldName, DataTypes.StringType, true);
            fields.add(field);
        }
        StructType schema = DataTypes.createStructType(fields);
        // Convert records of the RDD to Rows (only the first 20 of the 21 fields are mapped to the schema)
        JavaRDD<Row> rowRDD = LogRDD.map(new Function<String, Row>() {
            @Override
            public Row call(String record) throws Exception {
                String[] v = record.split("\t");
                System.out.println(v.length); // debug: field count, written to executor logs
                return RowFactory.create(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7], v[8], v[9],
                        v[10], v[11], v[12], v[13], v[14], v[15], v[16], v[17], v[18], v[19]);
            }
        });
        // Apply the schema to the RDD
        Dataset<Row> LogDataFrame = spark.createDataFrame(rowRDD, schema);
        // Create a temporary view using the DataFrame
        LogDataFrame.createOrReplaceTempView("people");
        // SQL can be run over a temporary view created using DataFrames
        // typical predicates: (k2 or k3) for time, (k4 or k10 or k15) for ip
        Dataset<Row> results = spark.sql("SELECT * FROM people where k2 = '" + time + "' and k4 = '" + ip + "'");
        // Dataset<Row> results = spark.sql("SELECT * FROM people where k2 like '%2019-03-01%' and k10 like '%210.40.16.90%'");
        // substring match: k like '%fdsfs%'
        // results.show(false);
        // The results of SQL queries are DataFrames and support all the normal RDD operations
        // The columns of a row in the result can be accessed by field index or by field name
        Dataset<String> namesDS = results.map(new MapFunction<Row, String>() {
            @Override
            public String call(Row row) throws Exception {
                return row.mkString();
            }
        }, Encoders.STRING());
        // System.out.println(namesDS.collect().toString());
        namesDS.show(false);
        System.out.println("sum: " + accumSum.value());
        System.out.println("Reserved: " + accumReserved.value());
        System.out.println("filtered: " + accumFiltered.value());
    }
}
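As a side note, the manual RDD-to-Row conversion above could plausibly be replaced by Spark's built-in CSV reader with a tab separator. A minimal sketch under that assumption (it reuses the schema and path variables from the listing above; DROPMALFORMED should discard rows whose field count does not match the schema, but unlike the filter it does not update the accumulators):

// Read the tab-separated file directly into a DataFrame using the programmatic schema.
Dataset<Row> LogDataFrame = spark.read()
        .schema(schema)                       // the 20-column all-string schema built above
        .option("sep", "\t")                  // fields are tab-separated
        .option("mode", "DROPMALFORMED")      // drop rows that do not fit the schema
        .csv(path);
LogDataFrame.createOrReplaceTempView("people");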
Version 3
Adds persistence (caching), so the second query can reuse the cached data instead of re-reading the input files.
package sql;
import java.util.ArrayList;
import java.util.List;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.apache.log4j.Logger;
import org.apache.log4j.Level;
import org.apache.spark.storage.StorageLevel;
import org.apache.spark.util.LongAccumulator;
public class SparkSqlCluster {
    public static void main(String[] args) {
        // if (args.length < 3) {
        //     System.err.println("Usage: SparkSqlCluster <file> <time> <ip>");
        //     System.exit(1);
        // }
        Logger.getRootLogger().setLevel(Level.WARN);
        SparkSession spark = SparkSession
                .builder()
                .master("spark://master106:7077")
                .appName("Spark SQL")
                .config("spark.some.config.option", "some-value")
                .getOrCreate();
        // Create three accumulators: total input rows, rows kept, rows filtered out
        LongAccumulator accumSum = spark.sparkContext().longAccumulator("accumSum");
        LongAccumulator accumReserved = spark.sparkContext().longAccumulator("Reserved");
        LongAccumulator accumFiltered = spark.sparkContext().longAccumulator("Filtered");
        runProgrammaticSchema(spark, accumSum, accumFiltered, accumReserved);
        // System.out.println("Argument path: " + args[0]);
        // System.out.println("Argument time: " + args[1]);
        // System.out.println("Argument ip: " + args[2]);
        spark.stop();
    }
    private static void runProgrammaticSchema(SparkSession spark, LongAccumulator accumSum, LongAccumulator accumFiltered,
                                              LongAccumulator accumReserved) {
        // Create an RDD of raw log lines
        JavaRDD<String> LogRDD = spark.sparkContext()
                .textFile("hdfs://master106/data/*", 2)
                // .textFile(path, 2)
                .toJavaRDD()
                .filter(new Function<String, Boolean>() {
                    @Override
                    public Boolean call(String v1) throws Exception {
                        // count every input line
                        accumSum.add(1);
                        String[] v = v1.split("\t");
                        if (v.length != 21) {
                            // count and log the malformed line
                            accumFiltered.add(1);
                            System.out.println(v1);
                        }
                        return v.length == 21;
                    }
                }).map(new Function<String, String>() {
                    @Override
                    public String call(String v1) throws Exception {
                        // count the lines kept after filtering
                        accumReserved.add(1);
                        return v1;
                    }
                });
        String schemaString = "k1,k2,k3,k4,k5,k6,k7,k8,k9,k10,k11,k12,k13,k14,k15,k16,k17,k18,k19,k20,k21";
        // Generate the schema based on the string of schema
        List<StructField> fields = new ArrayList<>();
        for (String fieldName : schemaString.split(",")) {
            StructField field = DataTypes.createStructField(fieldName, DataTypes.StringType, true);
            fields.add(field);
        }
        StructType schema = DataTypes.createStructType(fields);
        // Convert records of the RDD to Rows (all 21 fields are mapped this time) and persist the result
        JavaRDD<Row> rowRDD = LogRDD.map(new Function<String, Row>() {
            @Override
            public Row call(String record) throws Exception {
                String[] v = record.split("\t");
                System.out.println(v.length); // debug: field count, written to executor logs
                return RowFactory.create(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7], v[8], v[9],
                        v[10], v[11], v[12], v[13], v[14], v[15], v[16], v[17], v[18], v[19], v[20]);
            }
        }).persist(StorageLevel.MEMORY_AND_DISK_SER());
        // Apply the schema to the RDD
        Dataset<Row> LogDataFrame = spark.createDataFrame(rowRDD, schema);
        // Create a temporary view using the DataFrame
        LogDataFrame.createOrReplaceTempView("sniData");
        // First query after persisting
        long startTime = System.currentTimeMillis();
        Dataset<Row> result1 = spark.sql("SELECT * FROM sniData where k2 = '2019-03-01 15:12:01' and k21 like '%xvideos%'");
        // Dataset<Row> result1 = spark.sql("SELECT * FROM sniData where k21 like '%xvideos%'");
        // result1.select("k2", "k21").show(false);
        result1.show(10000);
        long stopTime1 = System.currentTimeMillis();
        System.out.println("sum: " + accumSum.value());
        System.out.println("Reserved: " + accumReserved.value());
        System.out.println("filtered: " + accumFiltered.value());
        System.out.println("===================================");
        // Second query after persisting; it should hit the cached RDD instead of re-reading HDFS
        Dataset<Row> result2 = spark.sql("SELECT * FROM sniData where k2 = '2019-03-01 15:12:01' and k21 like '%xvideos%'");
        result2.show(10000);
        long stopTime2 = System.currentTimeMillis();
        System.out.println("1st query: " + (stopTime1 - startTime) / 1000 + " s");
        System.out.println("2nd query: " + (stopTime2 - stopTime1) / 1000 + " s");
        System.out.println("sum: " + accumSum.value());
        System.out.println("Reserved: " + accumReserved.value());
        System.out.println("filtered: " + accumFiltered.value());
    }
}
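The persist call above caches the Row RDD that sits underneath the DataFrame. An alternative sketch, not part of the original program, would cache at the SQL layer instead, letting Spark store the registered view in its columnar in-memory format; the view name matches the listing above, and the count() is just one way to force the cache to be populated:

// Cache the registered view itself instead of the underlying RDD.
spark.catalog().cacheTable("sniData");
spark.table("sniData").count();            // trigger an action so the cache is actually filled
// ... run the two queries here ...
spark.catalog().uncacheTable("sniData");   // release the cached data when done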