抓取新浪微博 相册图片

package com.sxit;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Scanner;

import org.apache.commons.codec.binary.Base64;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;

/**
 * Single-threaded downloader for the pictures in a Sina Weibo user's album.
 *
 * <p>{@link #main(String[])} reads the account credentials, the target user
 * name and the number of pictures from standard input, calls
 * {@link #login(String, String, String, int)} to sign in and collect the
 * picture URLs into a static list, and then calls {@link #upload(String)} to
 * save every collected picture to the local disk.
 *
 * <p>Author: smile. Created 2013-02-03. Version 1.0.
 */
public class MoPic {

	/** Root directory under which one sub-directory per target user is created. */
	private static final String SAVE_ROOT = "E:/tmp/";
	/** Connect and read timeout used for picture downloads, in milliseconds. */
	private static final int TIMEOUT_MILLIS = 100 * 1000;
	/** Alphabet the login nonce is drawn from. */
	private static final String NONCE_CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";

	/** Shared HTTP client used for all requests made by {@link #login}. */
	private final static HttpClient client = new DefaultHttpClient();
	/** Picture URLs collected by {@link #login} and consumed by {@link #upload}. */
	private static final List<String> picList = new ArrayList<String>();

	/**
	 * Interactive entry point: prompts for credentials, target user and picture
	 * count, then logs in and downloads.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {

		try {
			Scanner scan = new Scanner(System.in);
			System.out.println("请输入你的用户名:");
			String username = scan.nextLine();
			System.out.println("请输入你的密码:");
			String password = scan.nextLine();
			System.out.println("请输入目标用户的用户名:");
			String targetname = scan.nextLine();
			System.out.println("请输入需要下载的相片数量:");
			int count = Integer.parseInt(scan.nextLine());
			// Sign in and collect the picture URLs.
			login(username, password, targetname, count);
			// Download the collected pictures.
			upload(targetname);
		} catch (Exception e) {
			// I/O, JSON and number-format failures are all reported the same way.
			e.printStackTrace();
		}
	}

	/**
	 * Downloads every picture URL collected by {@link #login} into
	 * {@code E:/tmp/<targetname>/}, naming each file after its list index plus
	 * the URL's file suffix. Despite its name this method downloads; the name
	 * is kept for compatibility with existing callers.
	 *
	 * <p>Pictures are processed from the last list entry to the first. A
	 * failure on one picture is reported and does not stop the remaining
	 * downloads.
	 *
	 * @param targetname name of the target user; used as the sub-directory name
	 * @throws Exception if the download directory cannot be created
	 */
	public static void upload(String targetname) throws Exception {

		if (picList.isEmpty()) {
			System.out.println("无相片信息!");
			return;
		}

		// Create the whole directory chain; mkdir() alone fails when the root is missing.
		File dir = new File(SAVE_ROOT + targetname);
		if (!dir.mkdirs() && !dir.isDirectory()) {
			throw new IOException("cannot create download directory: " + dir);
		}

		for (int i = picList.size() - 1; i >= 0; i--) {
			String p_url = picList.get(i);
			if (p_url == null || "".equals(p_url)) {
				continue;
			}
			try {
				// Extract the file suffix, ignoring dots that belong to the host or path.
				int index = p_url.lastIndexOf(".");
				System.out.println("索引位:" + index);
				String pos = index > p_url.lastIndexOf('/') ? p_url.substring(index) : "";
				System.out.println("后缀为:" + pos);
				download(p_url, new File(dir, i + pos));
				System.out.println("下载完第" + i + "张图");
			} catch (Exception e) {
				// Best effort: report the failure and carry on with the next picture.
				System.out.println("第" + i + "张图片下载失败,地址为:" + p_url + ",原因:" + e);
			}
		}
	}

	/**
	 * Copies the content behind {@code address} into {@code target}, always
	 * releasing the streams and the connection.
	 */
	private static void download(String address, File target) throws IOException {
		HttpURLConnection con = null;
		InputStream is = null;
		OutputStream os = null;
		try {
			con = (HttpURLConnection) new URL(address).openConnection();
			con.setConnectTimeout(TIMEOUT_MILLIS);
			con.setReadTimeout(TIMEOUT_MILLIS);
			is = new BufferedInputStream(con.getInputStream());
			os = new BufferedOutputStream(new FileOutputStream(target));
			byte[] b = new byte[1024];
			int length;
			while ((length = is.read(b)) != -1) {
				os.write(b, 0, length);
			}
			// Flush inside the try so a write failure is reported, not hidden by the quiet close.
			os.flush();
		} finally {
			closeQuietly(os);
			closeQuietly(is);
			if (con != null) {
				con.disconnect();
			}
		}
	}

	/** Closes {@code c} if it is non-null, ignoring any failure of the close itself. */
	private static void closeQuietly(java.io.Closeable c) {
		if (c != null) {
			try {
				c.close();
			} catch (IOException ignored) {
				// Nothing useful can be done when close fails.
			}
		}
	}

	/**
	 * Signs in to Sina Weibo and fills the picture list with the URLs of the
	 * target user's album pictures.
	 *
	 * <p>Steps: post the login form, follow the redirect URL embedded in the
	 * response, resolve the logged-in user's id and the target user's id from
	 * their profile pages, read the album id from the target's album page, and
	 * finally request the photo list as JSON.
	 *
	 * @param username   login account name
	 * @param password   login password
	 * @param targetName profile name of the user whose pictures are fetched
	 * @param pCount     number of pictures to request
	 * @throws IOException   if a request fails or a response does not contain
	 *                       the expected markers (for example when the login
	 *                       is rejected)
	 * @throws JSONException if the photo list response is not the expected JSON
	 */
	public static void login(String username, String password, String targetName, int pCount) throws IOException, JSONException {

		HttpPost post = new HttpPost("http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.5)");
		post.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0");
		post.setHeader("Referer", "http://weibo.com/");
		post.setHeader("Content-Type", "application/x-www-form-urlencoded");

		String servertime = getServerTime();
		String nonce = makeNonce(6);

		// Login form fields.
		List<NameValuePair> qparams = new ArrayList<NameValuePair>();
		qparams.add(new BasicNameValuePair("entry", "weibo"));
		qparams.add(new BasicNameValuePair("gateway", "1"));
		qparams.add(new BasicNameValuePair("from", ""));
		qparams.add(new BasicNameValuePair("savestate", "0"));
		qparams.add(new BasicNameValuePair("useticket", "1"));
		qparams.add(new BasicNameValuePair("pagerefer", ""));
		qparams.add(new BasicNameValuePair("service", "miniblog"));
		qparams.add(new BasicNameValuePair("servertime", servertime));
		qparams.add(new BasicNameValuePair("nonce", nonce));
		qparams.add(new BasicNameValuePair("pwencode", "wsse"));
		qparams.add(new BasicNameValuePair("encoding", "UTF-8"));
		qparams.add(new BasicNameValuePair("url", "http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack"));
		qparams.add(new BasicNameValuePair("returntype", "META"));
		// Encoded account name and password.
		qparams.add(new BasicNameValuePair("su", encodeAccount(username)));
		qparams.add(new BasicNameValuePair("sp", new SinaSSOEncoder().encode(password, servertime, nonce)));

		post.setEntity(new UrlEncodedFormEntity(qparams, "utf-8"));

		HttpResponse response = client.execute(post);
		String entity = EntityUtils.toString(response.getEntity());
		System.out.println("entity为:" + entity);
		// The redirect URL is embedded URL-encoded and ends with "code=0".
		String url = slice(entity, entity.indexOf("http%3A%2F%2Fweibo.com%2Fajaxlogin.php"), 0,
				entity.indexOf("code=0"), 6, "the login redirect URL (login rejected?)");
		// Explicit charset: the one-argument decode() is deprecated and platform dependent.
		url = URLDecoder.decode(url, "UTF-8");
		System.out.println("真实地址为:" + url);

		// Follow the redirect URL to complete the login.
		entity = fetch(url);
		System.out.println("----->>>" + entity);
		entity = slice(entity, entity.indexOf("userdomain"), 13, entity.lastIndexOf("\""), 0, "userdomain");
		System.out.println("......." + entity);

		// Id of the logged-in user.
		String uid = fetchOid("http://weibo.com/" + entity);
		System.out.println(uid);

		// Id of the target user, e.g. target name "bearsun".
		String pid = fetchOid("http://weibo.com/" + targetName);
		System.out.println(pid);

		// Only the "pictures attached to posts" album is read, e.g.
		// http://photo.weibo.com/1511804135/talbum/index?from=profile_wb
		String albumPage = fetch("http://photo.weibo.com/" + pid + "/talbum/index?from=profile_wb");
		String albumId = slice(albumPage, albumPage.indexOf("album_id"), 9,
				albumPage.indexOf("album_info"), -36, "album_id");
		System.out.println(albumId);

		// Picture URL shape: http://ww3.sinaimg.cn/mw690/6fb242fdjw1dzke8vygnwj.jpg
		// Request shape: http://photo.weibo.com/photos/get_all?uid=1511804135&album_id=14503807&count=32&page=1&type=3
		entity = fetch("http://photo.weibo.com/photos/get_all?uid=" + pid + "&album_id=" + albumId + "&count=" + pCount + "&page=1&type=3");
		JSONObject a = new JSONObject(entity);
		// Serialise the "data" element once and reuse it for logging and parsing.
		String photoData = a.get("data").toString();
		System.out.println(photoData);
		JSONArray list = new JSONObject(photoData).getJSONArray("photo_list");
		for (int i = 0; i < list.length(); i++) {
			String pic_name = "http://ww3.sinaimg.cn/mw690/" + list.getJSONObject(i).getString("pic_name");
			System.out.println(pic_name);
			picList.add(pic_name);
		}
	}

	/** Performs a GET on {@code address} with the shared client and returns the body as text. */
	private static String fetch(String address) throws IOException {
		HttpResponse response = client.execute(new HttpGet(address));
		return EntityUtils.toString(response.getEntity());
	}

	/** Fetches a profile page and cuts the user id out from between the "oid" and "$CONFIG['onick']" markers. */
	private static String fetchOid(String profileUrl) throws IOException {
		String page = fetch(profileUrl);
		return slice(page, page.indexOf("oid"), 9, page.lastIndexOf("$CONFIG['onick']"), -3, "oid");
	}

	/**
	 * Returns {@code text.substring(startMarker + startOffset, endMarker + endOffset)},
	 * but reports a missing marker or an inconsistent range as an
	 * {@link IOException} naming {@code what} could not be located.
	 */
	private static String slice(String text, int startMarker, int startOffset, int endMarker, int endOffset, String what) throws IOException {
		int begin = startMarker + startOffset;
		int end = endMarker + endOffset;
		if (startMarker < 0 || endMarker < 0 || begin > end || end > text.length()) {
			throw new IOException("unexpected response: cannot locate " + what);
		}
		return text.substring(begin, end);
	}

	/**
	 * Encodes the account name for the login form: URL-encode as UTF-8, then
	 * Base64. Returns an empty string if UTF-8 is reported as unsupported.
	 */
	private static String encodeAccount(String account) {
		String userName = "";
		try {
			userName = Base64.encodeBase64String(URLEncoder.encode(account, "UTF-8").getBytes("UTF-8"));
		} catch (UnsupportedEncodingException e) {
			e.printStackTrace();
		}
		return userName;
	}

	/** Builds a random string of {@code len} characters drawn from upper-case letters and digits. */
	private static String makeNonce(int len) {
		StringBuilder str = new StringBuilder(Math.max(len, 0));
		for (int i = 0; i < len; i++) {
			str.append(NONCE_CHARS.charAt((int) (Math.random() * NONCE_CHARS.length())));
		}
		return str.toString();
	}

	/** Returns the current time as whole seconds since the epoch, as a decimal string. */
	private static String getServerTime() {
		return String.valueOf(System.currentTimeMillis() / 1000);
	}

}
package com.sxit;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Scanner;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import org.apache.commons.codec.binary.Base64;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;

/**
 * Multi-threaded downloader for the pictures in a Sina Weibo user's album.
 *
 * <p>{@link #main(String[])} reads the account credentials, the target user
 * name and the number of pictures from standard input, calls
 * {@link #login(String, String, String, int)} to sign in and collect the
 * picture URLs into a static list, and then calls {@link #upload(String)},
 * which splits the list into batches and downloads them on a thread pool.
 *
 * <p>Author: smile. Created 2013-02-03. Version 1.0.
 */
public class MoPicThread {

	/** Root directory under which one sub-directory per target user is created. */
	private static final String SAVE_ROOT = "E:/tmp/";
	/** Connect and read timeout used for picture downloads, in milliseconds. */
	private static final int TIMEOUT_MILLIS = 100 * 1000;
	/** Number of pictures handed to one worker task. */
	private static final int BATCH_SIZE = 20;
	/** Number of worker threads in the download pool. */
	private static final int POOL_SIZE = 10;
	/** Alphabet the login nonce is drawn from. */
	private static final String NONCE_CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";

	/** Shared HTTP client used for all requests made by {@link #login}. */
	private final static HttpClient client = new DefaultHttpClient();
	/**
	 * Picture URLs collected by {@link #login}. The list is filled before any
	 * worker task is submitted and is only read by the workers afterwards.
	 */
	private static final List<String> picList = new ArrayList<String>();

	/**
	 * Interactive entry point: prompts for credentials, target user and picture
	 * count, then logs in and downloads.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {

		try {
			Scanner scan = new Scanner(System.in);
			System.out.println("请输入你的用户名:");
			String username = scan.nextLine();
			System.out.println("请输入你的密码:");
			String password = scan.nextLine();
			System.out.println("请输入目标用户的用户名:");
			String targetname = scan.nextLine();
			System.out.println("请输入需要下载的相片数量:");
			int count = Integer.parseInt(scan.nextLine());
			// Sign in and collect the picture URLs.
			login(username, password, targetname, count);
			// Download the collected pictures.
			upload(targetname);
		} catch (Exception e) {
			// I/O, JSON and number-format failures are all reported the same way.
			e.printStackTrace();
		}
	}

	/**
	 * Downloads every picture URL collected by {@link #login} into
	 * {@code E:/tmp/<targetname>/} using a fixed thread pool. Despite its name
	 * this method downloads; the name is kept for compatibility with existing
	 * callers.
	 *
	 * <p>The list is split into consecutive batches of {@value #BATCH_SIZE}
	 * pictures; the last batch holds whatever remains, so lists shorter than
	 * one batch and lists that are not a multiple of the batch size are fully
	 * covered. The method returns once all batches are submitted; the pool is
	 * shut down so it ends after the submitted work completes.
	 *
	 * @param targetname name of the target user; used as the sub-directory name
	 * @throws Exception if the download directory cannot be created
	 */
	public static void upload(String targetname) throws Exception {

		int total = picList.size();
		if (total == 0) {
			System.out.println("无相片信息!");
			return;
		}

		// Create the whole directory chain; mkdir() alone fails when the root is missing.
		File dir = new File(SAVE_ROOT + targetname);
		if (!dir.mkdirs() && !dir.isDirectory()) {
			throw new IOException("cannot create download directory: " + dir);
		}

		ExecutorService executor = Executors.newFixedThreadPool(POOL_SIZE);
		try {
			// One task per batch; "end" is inclusive and clamped to the last index.
			for (int start = 0; start < total; start += BATCH_SIZE) {
				int end = Math.min(start + BATCH_SIZE, total) - 1;
				executor.submit(new ImagThread(start, end, targetname));
			}
		} finally {
			// Always release the pool, even if a submit fails.
			executor.shutdown();
		}
	}

	/**
	 * Worker task that downloads the pictures at list indexes
	 * {@code start..end} (both inclusive) to the local disk.
	 */
	static class ImagThread implements Runnable {

		/** First list index handled by this task (inclusive). */
		private final int start;
		/** Last list index handled by this task (inclusive). */
		private final int end;
		/** Name of the target user. */
		private final String targetname;

		public ImagThread(int start, int end, String targetname) {
			this.start = start;
			this.end = end;
			this.targetname = targetname;
		}

		/** Downloads each picture in the range; a failure is reported and the loop continues. */
		public void run() {
			for (int i = start; i <= end; i++) {
				String address = picList.get(i);
				try {
					uploadImag(i, targetname, address);
				} catch (Exception e) {
					System.out.println("第" + i + "张图片下载失败,地址为:" + address + ",原因:" + e);
					continue;
				}
				System.out.println("线程" + Thread.currentThread().getName() + "下载完第" + i + "张图片");
			}
		}
	}

	/**
	 * Downloads a single picture to {@code E:/tmp/<targetname>/<i><suffix>},
	 * where the suffix is taken from the URL's file name.
	 *
	 * @param i          index of the picture; used as the file name
	 * @param targetname name of the target user; used as the sub-directory name
	 * @param p_url      address of the picture
	 * @throws Exception if the directory cannot be created, the URL is
	 *                   malformed, or the transfer fails
	 */
	public static void uploadImag(int i, String targetname, String p_url) throws Exception {

		File dir = new File(SAVE_ROOT + targetname);
		// mkdirs() first, then isDirectory(): tolerates another thread creating it concurrently.
		if (!dir.mkdirs() && !dir.isDirectory()) {
			throw new IOException("cannot create download directory: " + dir);
		}

		// Extract the file suffix, ignoring dots that belong to the host or path.
		int index = p_url.lastIndexOf(".");
		String pos = index > p_url.lastIndexOf('/') ? p_url.substring(index) : "";

		HttpURLConnection con = null;
		InputStream is = null;
		OutputStream os = null;
		try {
			con = (HttpURLConnection) new URL(p_url).openConnection();
			con.setConnectTimeout(TIMEOUT_MILLIS);
			con.setReadTimeout(TIMEOUT_MILLIS);
			is = new BufferedInputStream(con.getInputStream());
			os = new BufferedOutputStream(new FileOutputStream(new File(dir, i + pos)));
			byte[] b = new byte[1024];
			int length;
			while ((length = is.read(b)) != -1) {
				os.write(b, 0, length);
			}
			// Flush inside the try so a write failure is reported, not hidden by the quiet close.
			os.flush();
		} finally {
			// Null-safe cleanup: a stream may never have been opened if an earlier step failed.
			closeQuietly(os);
			closeQuietly(is);
			if (con != null) {
				con.disconnect();
			}
		}
	}

	/** Closes {@code c} if it is non-null, ignoring any failure of the close itself. */
	private static void closeQuietly(java.io.Closeable c) {
		if (c != null) {
			try {
				c.close();
			} catch (IOException ignored) {
				// Nothing useful can be done when close fails.
			}
		}
	}

	/**
	 * Signs in to Sina Weibo and fills the picture list with the URLs of the
	 * target user's album pictures.
	 *
	 * <p>Steps: post the login form, follow the redirect URL embedded in the
	 * response, resolve the logged-in user's id and the target user's id from
	 * their profile pages, read the album id from the target's album page, and
	 * finally request the photo list as JSON.
	 *
	 * @param username   login account name
	 * @param password   login password
	 * @param targetName profile name of the user whose pictures are fetched
	 * @param pCount     number of pictures to request
	 * @throws IOException   if a request fails or a response does not contain
	 *                       the expected markers (for example when the login
	 *                       is rejected)
	 * @throws JSONException if the photo list response is not the expected JSON
	 */
	public static void login(String username, String password, String targetName, int pCount) throws IOException, JSONException {

		HttpPost post = new HttpPost("http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.5)");
		post.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0");
		post.setHeader("Referer", "http://weibo.com/");
		post.setHeader("Content-Type", "application/x-www-form-urlencoded");

		String servertime = getServerTime();
		String nonce = makeNonce(6);

		// Login form fields.
		List<NameValuePair> qparams = new ArrayList<NameValuePair>();
		qparams.add(new BasicNameValuePair("entry", "weibo"));
		qparams.add(new BasicNameValuePair("gateway", "1"));
		qparams.add(new BasicNameValuePair("from", ""));
		qparams.add(new BasicNameValuePair("savestate", "0"));
		qparams.add(new BasicNameValuePair("useticket", "1"));
		qparams.add(new BasicNameValuePair("pagerefer", ""));
		qparams.add(new BasicNameValuePair("service", "miniblog"));
		qparams.add(new BasicNameValuePair("servertime", servertime));
		qparams.add(new BasicNameValuePair("nonce", nonce));
		qparams.add(new BasicNameValuePair("pwencode", "wsse"));
		qparams.add(new BasicNameValuePair("encoding", "UTF-8"));
		qparams.add(new BasicNameValuePair("url", "http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack"));
		qparams.add(new BasicNameValuePair("returntype", "META"));
		// Encoded account name and password.
		qparams.add(new BasicNameValuePair("su", encodeAccount(username)));
		qparams.add(new BasicNameValuePair("sp", new SinaSSOEncoder().encode(password, servertime, nonce)));

		post.setEntity(new UrlEncodedFormEntity(qparams, "utf-8"));

		HttpResponse response = client.execute(post);
		String entity = EntityUtils.toString(response.getEntity());
		System.out.println("entity为:" + entity);
		// The redirect URL is embedded URL-encoded and ends with "code=0".
		String url = slice(entity, entity.indexOf("http%3A%2F%2Fweibo.com%2Fajaxlogin.php"), 0,
				entity.indexOf("code=0"), 6, "the login redirect URL (login rejected?)");
		// Explicit charset: the one-argument decode() is deprecated and platform dependent.
		url = URLDecoder.decode(url, "UTF-8");
		System.out.println("真实地址为:" + url);

		// Follow the redirect URL to complete the login.
		entity = fetch(url);
		System.out.println("----->>>" + entity);
		entity = slice(entity, entity.indexOf("userdomain"), 13, entity.lastIndexOf("\""), 0, "userdomain");
		System.out.println("......." + entity);

		// Id of the logged-in user.
		String uid = fetchOid("http://weibo.com/" + entity);
		System.out.println(uid);

		// Id of the target user, e.g. target name "bearsun".
		String pid = fetchOid("http://weibo.com/" + targetName);
		System.out.println(pid);

		// Only the "pictures attached to posts" album is read, e.g.
		// http://photo.weibo.com/1511804135/talbum/index?from=profile_wb
		String albumPage = fetch("http://photo.weibo.com/" + pid + "/talbum/index?from=profile_wb");
		String albumId = slice(albumPage, albumPage.indexOf("album_id"), 9,
				albumPage.indexOf("album_info"), -36, "album_id");
		System.out.println(albumId);

		// Picture URL shape: http://ww3.sinaimg.cn/mw690/6fb242fdjw1dzke8vygnwj.jpg
		// Request shape: http://photo.weibo.com/photos/get_all?uid=1511804135&album_id=14503807&count=32&page=1&type=3
		entity = fetch("http://photo.weibo.com/photos/get_all?uid=" + pid + "&album_id=" + albumId + "&count=" + pCount + "&page=1&type=3");
		JSONObject a = new JSONObject(entity);
		// Serialise the "data" element once and reuse it for logging and parsing.
		String photoData = a.get("data").toString();
		System.out.println(photoData);
		JSONArray list = new JSONObject(photoData).getJSONArray("photo_list");
		for (int i = 0; i < list.length(); i++) {
			String pic_name = "http://ww3.sinaimg.cn/mw690/" + list.getJSONObject(i).getString("pic_name");
			System.out.println(pic_name);
			picList.add(pic_name);
		}
	}

	/** Performs a GET on {@code address} with the shared client and returns the body as text. */
	private static String fetch(String address) throws IOException {
		HttpResponse response = client.execute(new HttpGet(address));
		return EntityUtils.toString(response.getEntity());
	}

	/** Fetches a profile page and cuts the user id out from between the "oid" and "$CONFIG['onick']" markers. */
	private static String fetchOid(String profileUrl) throws IOException {
		String page = fetch(profileUrl);
		return slice(page, page.indexOf("oid"), 9, page.lastIndexOf("$CONFIG['onick']"), -3, "oid");
	}

	/**
	 * Returns {@code text.substring(startMarker + startOffset, endMarker + endOffset)},
	 * but reports a missing marker or an inconsistent range as an
	 * {@link IOException} naming {@code what} could not be located.
	 */
	private static String slice(String text, int startMarker, int startOffset, int endMarker, int endOffset, String what) throws IOException {
		int begin = startMarker + startOffset;
		int end = endMarker + endOffset;
		if (startMarker < 0 || endMarker < 0 || begin > end || end > text.length()) {
			throw new IOException("unexpected response: cannot locate " + what);
		}
		return text.substring(begin, end);
	}

	/**
	 * Encodes the account name for the login form: URL-encode as UTF-8, then
	 * Base64. Returns an empty string if UTF-8 is reported as unsupported.
	 */
	private static String encodeAccount(String account) {
		String userName = "";
		try {
			userName = Base64.encodeBase64String(URLEncoder.encode(account, "UTF-8").getBytes("UTF-8"));
		} catch (UnsupportedEncodingException e) {
			e.printStackTrace();
		}
		return userName;
	}

	/** Builds a random string of {@code len} characters drawn from upper-case letters and digits. */
	private static String makeNonce(int len) {
		StringBuilder str = new StringBuilder(Math.max(len, 0));
		for (int i = 0; i < len; i++) {
			str.append(NONCE_CHARS.charAt((int) (Math.random() * NONCE_CHARS.length())));
		}
		return str.toString();
	}

	/** Returns the current time as whole seconds since the epoch, as a decimal string. */
	private static String getServerTime() {
		return String.valueOf(System.currentTimeMillis() / 1000);
	}

}

猜你喜欢

转载自luan.iteye.com/blog/1787769