Using IntelliJ IDEA:
- Create a Maven project in IDEA.
- Add Scala under Project Structure (the Scala environment must already be set up, otherwise this will not work).
- Add the following to pom.xml (you can press Alt+Insert and choose "Dependency" to add the required dependencies):
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>org.example</groupId>
<artifactId>untitled</artifactId>
<version>1.0-SNAPSHOT</version>
<dependencies>
<!-- Spark core built against Scala 2.11; the project's Scala SDK must be a matching 2.11.x version. -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>2.4.8</version>
</dependency>
</dependencies>
<!-- Spark 2.4.x targets Java 8, so compile with source/target level 8. -->
<properties>
<maven.compiler.source>8</maven.compiler.source>
<maven.compiler.target>8</maven.compiler.target>
</properties>
</project>
- Scala code-completion setup: https://blog.csdn.net/qq_44065303/article/details/108345728
Case one:
- Count words (case-insensitive)
Word Count (word-frequency statistics) counts how many times each word appears in one or more files.
import org.apache.spark.{SparkConf, SparkContext}

/** Case-insensitive word count: counts how often each word occurs in a file on HDFS. */
object word {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("word")
    val sc = new SparkContext(conf)
    try {
      // Input file path on HDFS.
      val lines = sc.textFile("hdfs://localhost:9000/wordcount/word.txt")
      // Drop blank lines and upper-case everything so counting is case-insensitive.
      val normalized = lines.filter(_.trim.nonEmpty).map(_.toUpperCase)
      // Split into words, count each word, and print the counts in descending order.
      // (Original bound this Unit-valued chain to a val, which was misleading.)
      normalized
        .flatMap(_.split(" "))
        .map((_, 1))
        .reduceByKey(_ + _, 1)
        .sortBy(_._2, ascending = false)
        .collect()
        .foreach(println)
    } finally {
      sc.stop() // always release the SparkContext, even if the job fails
    }
  }
}
Map reference: https://blog.csdn.net/xianpanjia4616/article/details/80947616
Case two:
- Calculate the average age of a population of 1,000,000 people
First, write a Scala program that generates an age-data file for 1,000,000 people.
import java.io.FileWriter
import java.io.File
import scala.util.Random

/** Generates a data file with 1,000,000 lines of "id age" pairs (age in 0..99). */
object rry {
  def main(args: Array[String]): Unit = {
    // Second constructor argument `false` = overwrite, not append.
    val writer = new FileWriter(new File("/home/hadoop/data.txt"), false)
    val rand = new Random()
    try {
      for (i <- 1 to 1000000) {
        writer.write(i + " " + rand.nextInt(100))
        writer.write(System.getProperty("line.separator"))
      }
      writer.flush()
    } finally {
      // Close in finally so the file descriptor is not leaked if a write throws
      // (the original leaked the writer on any I/O error).
      writer.close()
    }
  }
}
Upload the data to HDFS:
Open a terminal.
Start Hadoop:
cd /usr/local/hadoop    # Hadoop installation path
./sbin/start-all.sh
# Upload the file (the second argument is the target directory on HDFS):
./bin/hdfs dfs -put /home/hadoop/data.txt /wordcount
- Programming
import org.apache.spark.{SparkConf, SparkContext}

/** Computes the average age over the generated "id age" file. */
object scala1000 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("10000r").setMaster("local")
    val sc = new SparkContext(conf)
    try {
      // Local file; "hdfs://localhost:9000/wordcount/data.txt" also works.
      val lines = sc.textFile("file:///home/hadoop/data.txt")
      val count = lines.count() // number of people: one person per line
      // The second whitespace-separated field is the age. Sum with a distributed
      // reduce instead of the original collect().reduce(...), which shipped all
      // one million values to the driver before adding them.
      val age = lines.map(_.split(" ")(1).trim.toInt).reduce(_ + _)
      val ct = age.toDouble / count.toDouble // average age
      println(age + " " + count + " " + ct)
    } finally {
      sc.stop() // the original never stopped the SparkContext
    }
  }
}
Case three:
- Compute gender and height statistics for a population of 10,000 people
Generate the data file with the following Scala program:
import java.io.FileWriter
import java.io.File
import scala.util.Random

/** Generates 10,000 lines of "id gender height" (gender M/F, height adjusted to plausible cm). */
object sheng {
  def main(args: Array[String]): Unit = {
    // Second constructor argument `false` = overwrite, not append.
    val writer = new FileWriter(new File("/home/hadoop/sheng.txt"), false)
    val rand = new Random()
    try {
      for (i <- 1 to 10000) {
        var height = rand.nextInt(220)
        if (height < 50) {
          height = height + 50
        }
        val gender = getRandomGender()
        if (height < 100 && gender == "M")
          height = height + 100
        if (height < 100 && gender == "F")
          height = height + 50
        // BUG FIX: the original called getRandomGender a second time when writing,
        // so the gender written out could differ from the one the height was
        // adjusted for. Reuse the gender computed above.
        writer.write(i + " " + gender + " " + height)
        writer.write(System.getProperty("line.separator"))
      }
      writer.flush()
      println("People Information File generated successfully.")
    } finally {
      writer.close() // close even on failure so the descriptor is not leaked
    }
  }

  /** Returns "M" or "F" with equal probability. */
  def getRandomGender(): String = {
    val rand = new Random()
    val randNum = rand.nextInt(2) + 1
    if (randNum % 2 == 0) "M" else "F"
  }
} // BUG FIX: the original was missing this closing brace for the object
The generated text looks like this:
1 M 177
2 F 210
3 M 193
4 M 220
......
- Programming
First separate the male and female records, then compute the minimum and maximum height for each group.
import org.apache.spark.{SparkConf, SparkContext}

/** Reports the minimum and maximum height per gender from the "id gender height" file. */
object sheng {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("shengg")
    val sc = new SparkContext(conf)
    try {
      val t = sc.textFile("file:///home/hadoop/sheng.txt")
      // Parse each line into (gender, height). Matching on the gender *field*
      // instead of line.contains("M"/"F") avoids false matches elsewhere in the
      // line; the original also built t2/s2 as unused duplicates of t1/s1.
      val parsed = t.map(_.split(" ")).map(f => (f(1), f(2).toInt))
      val maleHeights = parsed.filter(_._1 == "M").map(_._2)
      val femaleHeights = parsed.filter(_._1 == "F").map(_._2)
      // min()/max() replace the original sortBy(...).first(), which sorted the
      // whole RDD twice per gender just to read a single element.
      println("M:min " + maleHeights.min())
      println("M:max " + maleHeights.max())
      println("F:min " + femaleHeights.min())
      println("F:max " + femaleHeights.max())
    } finally {
      sc.stop()
    }
  }
}