package com.demo

import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Spark exercises over a file of quarterly GDP figures per province.
 *
 * Every input line is expected to look like `province,q1,q2,q3,q4`,
 * for example `北京市,500,900,300,300`.
 *
 * Each `answerN` method is self-contained: it creates its own local
 * SparkContext, runs one exercise, prints the result and stops the context.
 */
object Country {

  /** Location of the input file read by every exercise. */
  private val DataPath = "D:\\IDEA_Maven\\zhoukao2\\src\\main\\resources\\moni1.txt"

  /** Runs the enabled exercises one after another. */
  def main(args: Array[String]): Unit = {
    println("-----------")
    //answer1()
    answer2()
    answer3()
    answer4()
  }

  /**
   * One parsed input record.
   *
   * @param privence province name (first field of the line)
   * @param firstQr  first-quarter value
   * @param secondQr second-quarter value
   * @param thirdQr  third-quarter value
   * @param fourQr   fourth-quarter value
   * @param sumQr    total of the four quarters, stored so SQL queries can use it directly
   * @param pd       band label of the fourth quarter, as produced by [[judge]]
   */
  case class paiming(var privence: String, var firstQr: Int, var secondQr: Int, var thirdQr: Int, var fourQr: Int, var sumQr: Int, var pd: String)

  /**
   * Runs `body` with a fresh local SparkContext and always stops the context
   * afterwards, even when `body` throws. Without this a failed exercise would
   * leave its context running and the next `new SparkContext` would fail.
   */
  private def withSparkContext(appName: String)(body: SparkContext => Unit): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName(appName).setMaster("local"))
    try body(sc)
    finally sc.stop()
  }

  /**
   * Parses one `province,q1,q2,q3,q4` line into a [[paiming]] record.
   *
   * The yearly total is q1 + q2 + q3 + q4.
   *
   * @throws NumberFormatException          if a quarter field is not an integer
   * @throws ArrayIndexOutOfBoundsException if the line has fewer than five fields
   */
  private def parseRecord(line: String): paiming = {
    val fields = line.split(",").map(_.trim)
    val q1 = fields(1).toInt
    val q2 = fields(2).toInt
    val q3 = fields(3).toInt
    val q4 = fields(4).toInt
    paiming(fields(0), q1, q2, q3, q4, q1 + q2 + q3 + q4, judge(q4))
  }

  /** Reads the input file, skips blank lines and parses every remaining line. */
  private def loadRecords(sc: SparkContext) =
    sc.textFile(DataPath)
      .filter(_.trim.nonEmpty)
      .map(line => parseRecord(line))

  /**
   * Exercise 1: prints the five provinces with the highest yearly GDP
   * as `(province, yearlyTotal)` pairs, largest first.
   */
  def answer1(): Unit = withSparkContext("Country") { sc =>
    loadRecords(sc)
      .map(r => (r.privence, r.sumQr))
      .sortBy(_._2, ascending = false)
      .take(5)
      .foreach(println)
  }

  /**
   * Exercise 2: prints `(q1, q2, q3)` triples ordered by first quarter
   * descending, then second quarter ascending, then third quarter descending.
   */
  def answer2(): Unit = withSparkContext("demo") { sc =>
    loadRecords(sc)
      .map(r => (r.firstQr, r.secondQr, r.thirdQr))
      // The whole key is sorted descending; negating q2 turns its part into ascending.
      .sortBy(t => (t._1, -t._2, t._3), ascending = false)
      // Collect first so the rows are printed on the driver in sorted order.
      .collect()
      .foreach(println)
  }

  /**
   * Maps a value to its band label (the if/else equivalent of a SQL CASE WHEN).
   *
   * @param num value to classify
   * @return "<500", "<1000", "<1500" or "<2000" for values below the respective
   *         bound, otherwise ">2000" (this last label also covers exactly 2000)
   */
  def judge(num: Int): String =
    if (num < 500) "<500"
    else if (num < 1000) "<1000"
    else if (num < 1500) "<1500"
    else if (num < 2000) "<2000"
    else ">2000"

  /**
   * Exercise 4: registers the records as temp table `pm` and shows the
   * results of three SQL queries (A, B and C below).
   */
  def answer4(): Unit = withSparkContext("demo") { sc =>
    // Must be a stable identifier (val) so its implicits can be imported.
    val sqlT = new SQLContext(sc)
    import sqlT.implicits._

    val dataRdd = loadRecords(sc).toDF()
    dataRdd.registerTempTable("pm")

    // A) first quarter greater than 500 and second quarter less than 900
    sqlT.sql(" select * from pm where firstQr > 500 and secondQr < 900 ").show()
    // B) third quarter greater than 600 or fourth quarter greater than 800
    sqlT.sql(" select * from pm where thirdQr > 600 or fourQr > 800 ").show()
    // C) grouped by second quarter: the province with the largest third quarter in each group
    sqlT.sql(" select a.privence , a.secondQr , a.thirdQr from pm a , ( select secondQr , max(thirdQr) as thirdQr from pm group by secondQr ) b where a.secondQr = b.secondQr and a.thirdQr = b.thirdQr ").show()

    // D) (disabled experiments) group by fourth-quarter band: 0-500, 500-1000,
    //    1000-1500, 1500-2000, above 2000; find the province with the highest
    //    yearly GDP in each band.
    // println("######################小于500################################")
    // sqlT.sql(" select fourQr, max(sumQr) as sumQr , max('lt500') as flag from pm where fourQr <500 group by fourQr union select fourQr, max(sumQr) as sumQr , max('lt1000') as flag from pm where fourQr >=500 and fourQr <1000 group by fourQr union select fourQr, max(sumQr) as sumQr , max('lt1500') as flag from pm where fourQr >=1000 and fourQr <1500 group by fourQr union select fourQr, max(sumQr) as sumQr , max('lt2000') as flag from pm where fourQr >=1500 and fourQr <2000 group by fourQr union select fourQr, max(sumQr) as sumQr , max('gh2000') as flag from pm where fourQr >=2000 group by fourQr ").show()
    // println("######################500 - 1000 ################################")
    // sqlT.sql(" select pm1.pd, pm.privence, pm.fourQr, pm.sumQr from ( select pd, fourQr, max(sumQr) sumQr from ( select sumQr , fourQr , case when fourQr <500 then 'ly500' when fourQr <1000 and fourQr >=500 then 'ly1000' when fourQr <1500 and fourQr >1000 then 'ly1500' when fourQr <2000 and fourQr >1500 then 'ly2000' else 'gh2000' end as pd from pm ) a group by pd ,fourQr ) pm1 join pm on pm1.fourQr = pm.fourQr and pm1.sumQr = pm.sumQr order by pm1.pd desc ").show() // .rdd.repartition(1).saveAsTextFile("c:\\temp11")
    // sqlT.sql(" select a.fourQr, a.pd , max(sumQr) as sumQr from ( select sumQr , fourQr , case when fourQr <500 then 'ly500' when fourQr <1000 and fourQr >=500 then 'ly1000' when fourQr <1500 and fourQr >1000 then 'ly1500' when fourQr <2000 and fourQr >1500 then 'ly2000' else 'gh2000' end as pd from pm ) a group by a.fourQr , a.pd ").show()
    // sqlT.sql("select a.privence , a.fourQr, a.sumQr from pm a ,( select a.fourQr, a.pd , max(sumQr) as sumQr from ( select sumQr , fourQr , case when fourQr <500 then 'ly500' when fourQr <1000 and fourQr >=500 then 'ly1000' when fourQr <1500 and fourQr >1000 then 'ly1500' when fourQr <2000 and fourQr >1500 then 'ly2000' else 'gh2000' end as pd from pm ) a group by a.fourQr , a.pd ) b where a.fourQr= b.fourQr and a.sumQr = b.sumQr ").show()
    // sqlT.sql(" select a.privence ,a.fourQr, a.sumQr from pm a ,( select fourQr , pd , max(sumQr) as sumQr from pm group by fourQr , pd) b where a.fourQr = b.fourQr and a.sumQr = b.sumQr ").show()
  }

  /**
   * Exercise 3: registers the records as temp table `t` with columns
   * `cs, col1, col2, col3, col4, sumgdp, pd` and shows the maximum yearly GDP
   * per fourth-quarter band.
   */
  def answer3(): Unit = withSparkContext("demo") { sc =>
    // Must be a stable identifier (val) so its implicits can be imported.
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    val dataFrameRdd = loadRecords(sc)
      .map(r => (r.privence, r.firstQr, r.secondQr, r.thirdQr, r.fourQr, r.sumQr, r.pd))
      .toDF("cs", "col1", "col2", "col3", "col4", "sumgdp", "pd")
    dataFrameRdd.registerTempTable("t")

    sqlContext.sql("select pd , max(sumgdp) as sumGdp from t group by pd ").show()
  }

}
北京市 country
猜你喜欢
转载自www.cnblogs.com/xjqi/p/12817103.html
今日推荐
周排行