1 package spark_example01; 2 3 4 import java.io.File; 5 import java.io.FileWriter; 6 import java.io.IOException; 7 import java.util.Random; 8 9 /** 10 */ 11 public class PeopleInfoFileGenerator { 12 public static void main(String[] args){ 13 File file = new File("/Users/xls/Desktop/code/bigdata/data/PeopleInfo.txt"); 14 15 try { 16 the Random Random = new new the Random (); // generates a random number . 17 FileWriter FileWriter = new new FileWriter (File); // Create a new file 18 is for ( Long I =. 1; I <= 100000000; I ++) { // generate 10 million digits . 19 int height = random.nextInt (220 ); 20 is IF (height <50 ) { 21 is height = height + 50 ; 22 is } 23 is String getRandomGender gender = (); // gender method 24 IF (height < 100 && gender == "M") { 25 height = height + 100; 26 } 27 if (height < 100 && gender == "F") { 28 height = height + 40; 29 } 30 fileWriter.write( i + " " + getRandomGender() + " " + height); //文件格式:ID 性别 身高 31 fileWriter.write(System.getProperty("line.separator")); 32 } 33 fileWriter.flush(); 34 fileWriter.close(); 35 System.out.println ( "People Information File Generated successfully." ); 36 } the catch (IOException E) { 37 [ e.printStackTrace (); 38 is } 39 } 40 41 is public static String getRandomGender () { // build a randomly generated gender method 42 is the Random Random = new new the Random (); 43 is int randomNum random.nextInt = (2) +. 1 ; 44 is IF (randomNum% 2 == 0 ) { 45 return "M" ; 46 is }else{ 47 return "F"; 48 } 49 } 50 }
1 package spark_example01; 2 3 import org.apache.spark.SparkConf; 4 import org.apache.spark.api.java.JavaRDD; 5 import org.apache.spark.api.java.JavaSparkContext; 6 import org.apache.spark.api.java.function.FlatMapFunction; 7 import org.apache.spark.api.java.function.Function; 8 import java.util.Arrays; 9 /** 10 * Created by Administrator on 2017/11/17. 11 */ 12 public class PeopleInfoCalculator { 13 public static void main(String[] args){ 14 15 SparkConf sparkConf = new SparkConf().setAppName("PeopleInfoCalculator").setMaster("local[3]"); 16 17 JavaSparkContext sc = new JavaSparkContext(sparkConf); 18 19 JavaRDD<String> dataFile = sc.textFile("/Users/xls/Desktop/code/bigdata/data/PeopleInfo.txt"); 20 21 // step1:过滤出性别为M的数据 22 JavaRDD<String> maleFilterData = dataFile.filter(new Function<String, Boolean>() { 23 24 public Boolean call(String s) throws Exception { 25 return s.contains("M"); 26 } 27 }); 28 29 //step2:过滤出性别为F的数据 30 JavaRDD<String> femaleFilterData = dataFile.filter(new Function<String, Boolean>() { 31 32 public Boolean call(String s) throws Exception { 33 return s.contains("F"); 34 } 35 }); 36 37 //setp3: gender obtain the height data string of M --- be segmented for each row, and finally get [2] used in the string --- flatMap string segmentation 38 is JavaRDD <String> maleHeightData = maleFilterData.flatMap ( new new FlatMapFunction <String, String> () { 39 @Override 40 public the java.util.Iterator <String> Call (String S) throws Exception { 41 is return Arrays.asList (s.split ( "") [2 ]). Iterator (); 42 is } 43 is }); 44 is 45 // Step4: the height data is obtained gender F --- string of each line is segmented, and finally get [2] used in the string --- flatMap segmentation string 46 is JavaRDD <string> femaleHeightData femaleFilterData.flatMap = (new new FlatMapFunction <String, String> () { 47 @Override 48 public the java.util.Iterator <String> Call (String S) throws Exception { 49 return Arrays.asList (s.split ( "") [2 ]). 
Iterator (); 50 } 51 is }); 52 is 53 is // STEP5: male height of the transformed format to integer format string 54 is JavaRDD <integer> = maleHeightDataInt maleHeightData.map ( new new Function <string, integer> () { // 55 @Override 56 is public Integer Call (String S) throws{Exception 57 is return the Integer.parseInt (String.valueOf (S)); 58 } 59 }); 60 61 is // Step6: converting the format string female height integer format 62 is JavaRDD <Integer> = femaleHeightData.map femaleHeightDataInt ( new new Function <string, integer> () { // convert string format to integer format 63 is @Override 64 public integer Call (string S) throws Exception { 65 return the Integer.parseInt (String.valueOf (S)); 66 } 67 }); 68 69 // the sortBy (<T>, Ascending, numPartitions) Explanation: 70 // first parameter is a function that also has a generic parameter T belt, return type and RDD type elements are consistent; 71 // second parameter is ascending, which parameters determine the elements in the sorted RDD is ascending or descending, the default is true, that is, in ascending order; 72 // third parameter is numPartitions, this parameter determines the sorted partition RDD number, the number of partitions and the number before ordering the default sort equal, i.e. this.partitions.size. 73 is 74 // STEP7: pressing the male height from lowest to highest --- represents a true default sorting parameter, sorted in ascending order, from low to high exhaust 75 JavaRDD <Integer> = maleHeightLowSort maleHeightDataInt.sortBy ( new new Function <Integer , Integer> () { 76 public Integer Call (Integer S) throws Exception { 77 return S; 78 } 79 }, true ,. 
3 ); 80 81 // Step8: female pressing height from lowest to highest --- represents a true default sorting parameter, sorted in ascending order, from low to high exhaust 82 JavaRDD <Integer> femaleHeightLowSort femaleHeightDataInt.sortBy = ( new new Function <Integer, Integer> () { 83 public Integer Call (Integer S) throws Exception { 84 return S; 85 } 86 }, to true ,. 3 ); 87 88 // Step9: male press height the descending sort --- false represented as descending order, from high to low 89 JavaRDD <Integer> = maleHeightHightSort maleHeightDataInt.sortBy ( new new Function <Integer, Integer> () { 90 public Integer Call (Integer S) throws Exception { 91 is return S; 92 } 93 }, to false ,. 3 ); 94 95 // Step10 : height of the female sort descending --- pressing to false as represented in descending order from high to low 96 JavaRDD <Integer> = femaleHeightHightSort femaleHeightDataInt.sortBy ( new new Function <Integer, Integer> () { 97 public Integer Call (Integer S) throws Exception { 98 return S; 99 } 100 }, to false ,. 
3 ); 101 102 Integer lowestMale maleHeightLowSort.first = (); // first number determined in ascending order, i.e., the minimum value of 103 Integer lowestFemale femaleHeightLowSort.first = (); / / first number determined in ascending order, i.e., the minimum value of 104 Integer highestMale maleHeightHightSort.first = (); // first number determined in descending order, i.e., the maximum value of 105 Integer highestFemale femaleHeightHightSort.first = (); // find descending the first number, i.e., the maximum value of 106 107 System.out.println ( "number the peole of for Woman:" + femaleHeightData.count ());// total number of women obtaining 108 System.out.println ( "Number The peole of for a Man:" + maleHeightData.count ()); // total number of males is determined 109 System.out.println ( "Lowest for a Man : "+ lowestMale); // find the shortest male height 110 System.out.println (" Lowest female: "+ lowestFemale); // find women's shortest height 111 System.out.println (" Highest male: " highestMale +); // determine the maximum height male 112 System.out.println ( "highest for woman:" + highestFemale); // determine the maximum height F 113 114 } 115 } 1 16 117 / * 1 18 . * A case is described 119 This case Suppose we need statistical population (100,000) a province of gender as well as height, need to calculate the number of men and women, men of the highest and lowest height, as well as women in the highest and lowest height. 120 in this case used in the source file has the following format, three are ID, gender, height (cm), the following format: 121 . B generate demographic data 122 using a randomly generated set of Java language demographic data, including sequence ID gender M / F, height cm, code is as follows: 123 c examples of process analysis. 
// For this case we need statistics for men and women separately, so the natural first step is to separate the male and female records in the RDD built from the source file, producing two new RDDs that contain the men's and the women's records respectively;
// next, each of these two RDDs is mapped further so that it contains only height data, giving two RDDs: male heights and female heights;
// finally, these two RDDs are sorted to obtain the highest and lowest male and female heights.
// Step one: separate the male and female records with the filter operator; lines containing "M" are men, lines containing "F" are women.
// Step two: use the map operator to extract the height data of each gender from its RDD.
// Step three: use the sortBy operator to sort the height data.
// Special note: the height data must be converted to integers during the RDD transformations; otherwise sortBy treats the values as strings and the sort result is wrong.
// For example, the heights 123, 110, 84, 72, 100 would be sorted in ascending order as 100, 110, 123, 72, 84, which is clearly incorrect.
// d. Code implementing the height statistics: