This assumes the Spark cluster has already been set up. If you don't know how to build one, please refer to this link:
http://qindongliang.iteye.com/blog/2224797
Note that to submit a job you need to package the code into a jar with sbt (the jar lands under target/scala-2.11/), and then register the jar's path in the driver program via setJars so the job can be submitted remotely; there is no need to run the test on the remote cluster itself. This test uses Spark's Standalone mode.
The sbt build definition is as follows:
```scala
name := "spark-hello"

version := "1.0"

scalaVersion := "2.11.7"

// Use the company's private Nexus server
resolvers += "Local Maven Repository" at "http://dev.bizbook-inc.com:8083/nexus/content/groups/public/"

// Resolve only against the internal repository (skip Maven Central)
externalResolvers := Resolver.withDefaultResolvers(resolvers.value, mavenCentral = false)

// Hadoop dependency
libraryDependencies += "org.apache.hadoop" % "hadoop-client" % "2.7.1"

// Spark core dependency
libraryDependencies += "org.apache.spark" % "spark-core_2.11" % "1.4.1"

// Spark SQL dependency
libraryDependencies += "org.apache.spark" % "spark-sql_2.11" % "1.4.1"

// Java Servlet API dependency
libraryDependencies += "javax.servlet" % "javax.servlet-api" % "3.0.1"
```
name := "spark-hello" version := "1.0" scalaVersion := "2.11.7" //Use the company's private server resolvers += "Local Maven Repository" at "http://dev.bizbook-inc.com:8083/nexus/content/groups/public/" // use internal repository externalResolvers := Resolver.withDefaultResolvers(resolvers.value, mavenCentral = false) //Hadoop dependencies libraryDependencies += "org.apache.hadoop" % "hadoop-client" % "2.7.1" //Spark dependencies libraryDependencies += "org.apache.spark" % "spark-core_2.11" % "1.4.1" //Spark SQL dependency libraryDependencies += "org.apache.spark" % "spark-sql_2.11" % "1.4.1" //java servlet dependencies libraryDependencies += "javax.servlet" % "javax.servlet-api" % "3.0.1"
demo1: Read HDFS data using Scala:
```scala
/**
 * Spark reads data from HDFS
 */
def readDataFromHDFS(): Unit = {
  // Run in standalone mode and submit to the remote Spark cluster
  val conf = new SparkConf().setMaster("spark://h1:7077").setAppName("load hdfs data")
  // Register the jar that needs to be shipped to the cluster
  conf.setJars(Seq(jarPaths))
  // Get a Spark context
  val sc = new SparkContext(conf)
  val textFile = sc.textFile("hdfs://h1:8020/user/webmaster/crawldb/etl_monitor/part-m-00000")
  // Get the first record
  // val data = textFile.first()
  // println(data)
  // Traverse and print:
  /**
   * collect() pulls every row back to the driver
   * take(5)   keeps only the first N rows
   * foreach() prints each row
   * stop()    shuts down the SparkContext
   */
  textFile.collect().take(5).foreach(line => println(line))
  // Close the resource
  sc.stop()
}
```
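A quick aside, not in the original post: collect() pulls the entire file back to the driver before take(5) trims it. For large files, calling take(5) directly on the RDD is cheaper, since Spark only reads as many partitions as it needs. A minimal sketch of that variant, reusing the same textFile value:

```scala
// Fetch only the first 5 lines instead of collecting the whole RDD on the driver
textFile.take(5).foreach(println)
```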
demo2: Create data on the client side in Scala and test Spark SQL:
```scala
def mappingLocalSQL1() {
  val conf = new SparkConf().setMaster("spark://h1:7077").setAppName("hdfs data count")
  conf.setJars(Seq(jarPaths))
  val sc = new SparkContext(conf)
  val sqlContext = new SQLContext(sc)
  // Import the implicits that convert local collections / RDDs to DataFrames
  import sqlContext.implicits._
  val df = sc.parallelize((1 to 100).map(i => Record(i, s"val_$i"))).toDF()
  df.registerTempTable("records")
  println("Result of SELECT *:")
  sqlContext.sql("SELECT * FROM records").collect().foreach(println)
  // Aggregate query
  val count = sqlContext.sql("SELECT COUNT(*) FROM records").collect().head.getLong(0)
  println(s"COUNT(*): $count")
  sc.stop()
}
```
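As an aside not taken from the original post, the same aggregate can also be computed with the Spark 1.4 DataFrame API instead of a SQL string, which skips the query parsing step:

```scala
// Equivalent aggregate using the DataFrame API on the df defined above
val dfCount = df.count()
println(s"COUNT(*) via DataFrame API: $dfCount")
```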
Spark SQL reads HDFS data by mapping each row onto an entity (case) class and its fields. Note that the case class definitions must sit at the top level of the Scala object, outside the methods; otherwise problems will occur. A sketch of what they might look like is shown below.
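The original post does not show the case classes themselves, so the following is a minimal sketch. The field names and their order are assumptions, inferred from Record(i, s"val_$i") in demo2 and from the four-column mapping and the SELECT rowkey,title,dtime query in demo3:

```scala
// Assumed case classes; they must be declared at the top level of the driver object
case class Record(key: Int, value: String)

// All fields are Options so that missing columns become SQL NULLs;
// the field names are hypothetical, only the types are inferred from the mapping code
case class Model(rowkey: Option[String],
                 title: Option[String],
                 dtime: Option[String],
                 count: Option[Long])
```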
demo3: Use Scala to remotely read an HDFS file, map it to a Spark SQL table, and query the top 10 rows:
```scala
val jarPaths = "target/scala-2.11/spark-hello_2.11-1.0.jar"

/** The way Spark SQL maps rows to entity classes **/
def mapSQL2(): Unit = {
  // The case class fields are all Option types; a missing value defaults to NULL
  // SparkConf specifies the master and the task name
  val conf = new SparkConf().setMaster("spark://h1:7077").setAppName("spark sql query hdfs file")
  // Set the jar that needs to be uploaded to the cluster
  conf.setJars(Seq(jarPaths))
  // Get the Spark context
  val sc = new SparkContext(conf)
  // Get the SQL context
  val sqlContext = new SQLContext(sc)
  // This import is required for the implicit conversion of an RDD to a DataFrame
  import sqlContext.implicits._
  // Read a file on HDFS and split each line into an array on the \u0001 (Ctrl-A) delimiter,
  // then map the fields according to the array length to avoid out-of-bounds errors
  val model = sc.textFile("hdfs://h1:8020/user/webmaster/crawldb/etl_monitor/part-m-00000").map(_.split("\u0001"))
    .map( p => ( if (p.length == 4) Model(Some(p(0)), Some(p(1)), Some(p(2)), Some(p(3).toLong))
                 else if (p.length == 3) Model(Some(p(0)), Some(p(1)), Some(p(2)), None)
                 else if (p.length == 2) Model(Some(p(0)), Some(p(1)), None, None)
                 else Model(Some(p(0)), None, None, None)
    )).toDF() // Convert to a DataFrame
  // Register a temporary table
  model.registerTempTable("monitor")
  // Execute the SQL query
  val it = sqlContext.sql("SELECT rowkey,title,dtime FROM monitor limit 10 ")
  // val it = sqlContext.sql("SELECT rowkey,title,dtime FROM monitor WHERE title IS NULL AND dtime IS NOT NULL ")
  println("start")
  it.collect().take(8).foreach(line => println(line))
  println("end")
  sc.stop()
}
```
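One more hedged aside: the length-based if/else chain above can also be written with Array.lift, which returns an Option and handles missing columns without explicit length checks. This sketch assumes it runs inside the same method, so sc, the implicits import, and the assumed Model case class are in scope:

```scala
// Alternative mapping: p.lift(i) yields Some(value) when index i exists, otherwise None
val model2 = sc.textFile("hdfs://h1:8020/user/webmaster/crawldb/etl_monitor/part-m-00000")
  .map(_.split("\u0001"))
  .map(p => Model(p.lift(0), p.lift(1), p.lift(2), p.lift(3).map(_.toLong)))
  .toDF()
```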
In IDEA's console, output like the following is printed: