Mapreduce框架
输入 user1 movie1 rating
mapreduce 1阶段
map(<user>,<movie>,<rating>){
k2=movie;
v2=Tuple2(user, rating);
emit(k2,v2);
}
//key =movie
// values=List<Tuple2<user,rating>>
reduce(key,values){
numberofraters=values.size();
for(Tuple2<user,rating> t2:values){
k3=t2.user;
v3=Tuple3(key,t2.rating,numberofraters);
emit(k3,v3);
}
}
mapreduce 第二阶段
//key=user
//value=movie,rating,numberofraters
map(key, value){
String[] tokens=value.split(",");
movie=tokens[0];
rating=tokens[1];
numberofraters=tokens[2];
Tuple3<String,Integer,Integer> v3=new Tuple3(movie,rating,numberofraters);
emit(key,v3)
}
//key=user
//values=list(Tuple3(movie,rating,numberofraters))
reduce(key,values){
List<Tuple2<Tuple3<movie1,rating1,numberofraters1>,Tuple3<movie2,rating2,numberofraters2>>> list=
generateUniqueCombinations(values);
for(Tuple2<Tuple3<movie1,rating1,numberofraters1>,Tuple3<movie2,rating2,numberofraters2>> pair :list){
m1=pair._1;m2=pair._2;
reducekey=Tuple2(m1.movie,m2.movie);
int ratingproduct=m1.rating*m2.rating;
int rating1squared= m1.rating*m1.rating;
int rating2squared=m2.rating*m2.rating;
reducevalue=Tuple7(m1.rating,m1.numberofraters,m2.rating,m2.numberofraters,ratingproduct,rating1squared,rating2squared);
emit(reducekey,reducevalue);
}
mapreduce阶段3
map(key,value){emit(key,value)}
reduce(key, values){
int groupsize=values.size();
int dotproduct=0;
int rating1sum=0;
int rating2sum=0;
int rating1norsq=0;
int rating2norsq=0;
int maxnumofraters1=0;
int maxnumofraters2=0;
for(Tuple7(...) :values){
dotproduct +=ratingprod;
rating1sum +=rating1;
rating2sum +=rating2;
rating1norsq +=rating1squared;
rating2norsq +=rating2squared;
if(numofraters1>maxnumofraters1){maxnumofraters1=numofraters1;}
if(numofraters2>maxnumofraters2){maxnumofraters2=numofraters2;}
}
double pearson=calculatepearsonCorrelation(groupsize,dotproduct,rating1sum,rating2sum,rating1norsq,rating2norsq);
double cosine=calculatecosineCorrelation(groupsize,dotproduct,rating1sum,rating2sum,rating1norsq,rating2norsq);
double jaccard=calculatejaccardCorrelation(groupsize,dotproduct,rating1sum,rating2sum,rating1norsq,rating2norsq);
return Tuple3(pearson,cosine,jaccard);
}
}
spark 电影推荐实现
public class MovieRecommendatonsWithJoin{
public static void main(String[] args) throws Exception{
//处理输入参数
//创建spark上下文对象
JavaSparkRDD ctx = new JavaSparkContext();
//读取hdfs文件创建RDD
JavaRDD<String> userRatings=ctx.textFile(inputfile,1);
//找出谁曾对这个电影评分
JavaPairRDD<String,Tuple2<String,Integer>> movieRDD= userRatings.mapToPair(
new PairFunction<String,String,Tuple2<String,Integer>>(){
public Tuple2<String,Tuple2<String,Integer>>call(String s){
String[] record=s.split(" ");
String user = record[0];
String movie= record[1];
Integer rating= new Integer(record[2]);
Tuple2<String,Integer> value = new Tuple2<String,Integer>(user, rating);
return(new Tuple2<String,Tuple2<String,Integer>>(movie,value));
}
})
//按照movie对RDD分组
JavaPairRDD<String,Iterable<Tuple2<String,Integer>>> moviegrouped=movieRDD.groupByKey();
//得出每个电影的评分人数,创建K为user,V为movie rating, numberofraters的RDD
JavaPairRDD<String,Tuple3<String,Integer,Integer>> userRDD= moviegrouped.flatMapToPait(
new PairFlatMapFunction<Tuple2<String, Iterable<Tuple2<String,Integer>>>>,
String, Tuple3<String,Integer,Integer>>(){
public Iterable<Tuple2<String,Tuple3<String,Integer,Integer>>>
call(Tuple2<String,Iterable<Tuple2<String,Integer>>>s){
List<Tuple2<String,Integer>> listofuserandratings= new ArrayList<Tuple2<String,Integer>>();
String movie =s._1;
Iterable<Tuple2<String,Integer>> pairofuserandrating=s._2;
int numerofratings=0;
for(Tuple2<String,Integer> t2:pairofuserandrating){
numberofratings++;
listofuserandrating.add(t2);}
}
List<Tuple2<Sting,Tuple3<String,Integer,Integer>>> results=
new ArrayList<uple2<Sting,Tuple3<String,Integer,Integer>>>();
for(Tuple2<String,Integer> t2:listofuserandrating){
String user=t2._1;
Integer rating =t2._2;
Tuple3<String,Integer,Integer> t3= new Tuple3<String,Integer,Integer>(movie,rating,numberofrating);
results.add(new Tuple2<String,Tuple3<String,Integer,Integer>>(user,t3));
}
return results;
})
//将userRDD与自身相连找出所有所有movie1,movie2对 joinedRDD=userRDD.join(userRDD)
JavaPairRDD<String,Tuple2<Tuple3<String,Integer,Integer>,Tuple3<String,Integer,Integer>>> joinedRDD=
userRDD.join(userRDD);
//删除重复的movie1,movie2对
JavaPairRDD<String,Tuple2<Tuple3<String,Integer,Integer>,Tuple3<String,Integer,Integer>>>=
joinedRDD.filer(new Function<
Tuple2<String,Tuple2<Tuple3<String,Integer,Integer>,Tuple3<String,Integer,Integer>>>,Boolean>(){
public Boolean call(Tuple2<String,Tuple2<Tuple3<String,Integer,Integer>,Tuple3<String,Integer,Integer>>> s){
Tuple3<String,Integer,Integer> movie1=s._2._1;
Tuple3<String,Integer,Integer> movie2=s._2._2;
String moviename1=movie1.first();
String moviename2=movie2.first();
if(moviename1.compareTo(moviename2)<0){retun true;} else {return false;}
}
});
//生成所有的movie1,movie2组合 创建k,v对,k为movie1,movie2组合, value 为相应的计算参数
JavaPairRDD<Tuple<String,String>,Tuple7<Integer,Integer,Integer,Integer,Integer,Integer,Integer>>moviepair=
fileredRDD.mapToPair(new PairFunction<
Tuple2<String,Tuple2<Tuple3<String,Integer,Integer>,Tuple3<String,Integer,Integer>>>,
Tuple2<String,String>,
Tuple7<Integer,Integer,Integer,Integer,Integer,Integer,Integer>>(){
public Tuple2< Tuple2<String,String>, Tuple7<Integer,Integer,Integer,Integer,Integer,Integer,Integer>>call(
Tuple2<String,Tuple2<Tuple3<String,Integer,Integer>,Tuple3<String,Integer,Integer>>>s){
Tuple3<String,Integer,Integer> movie1=s._2._1;
Tuple3<String,Integer,Integer> movie2=s._2._2;
Tuple2<String,String> m1m2key=new Tuple2<String,String>(movie1._1,movie2._1);
int ratingproduct= movie1._2*movie2._2;
int rating1squared=movie1._2*movie1._2;
int rating2squared=movie2._2*movie2._2;
Tuple7<Integer,...Integer> t7=new Tuple7<Integer,...Integer>(movie1._2,movie1._3,movie2._2,movie2._3,
ratingproduct,rating1squared,rating2squared);
return new Tuple2<Tuple2<String,String>,Tuple7<Integer,...Integer>>(m1m2key,t7);
}
});
//按键分组
JavaPairRDD<Tuple2<String,String>,Iterable<Tuple7<Integer,...Integer>>>corrRDD=moviePairs.groupByKey();
//计算每个movie1,movie2的关联度
JavaPairRDD<Tuple2<String,String>,Tuple3<Double,Double,Double>> corr=corrRDD.mapValues(
new Function<Iterable<Tuple7<Integer,...Integer>>,Tuple3<Double,Double,Double>>(){
public Tuple3<Double,Double,Double>call(Iterable<Tuple7<Integer,...Integer>>s){
return calculatecorrelations(s);
}})
//
//
//
//
//
}
}