抓取新浪读书频道的小说

现在的位置: 首页＞Coding＞正文

下篇

抓取新浪读书频道的小说

2012年12月22日 ⁄ Coding ⁄ 暂无评论 ⁄ 被围观 8,249 views+

发现公司很多同事喜欢浏览小说网站，但是每看完一章就要翻页，感觉很不爽，于是写了个小程序来下载网站上的整部小说。程序只是以新浪读书频道为例，进行适当修改后也可以用来抓取其他小说网站的小说。当然，该程序还可以扩展一下，做到直接输入小说首页的网址就可以下载，懒得整了，以后有时间再整吧。下面贴出源码：

package sinabook;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.URL;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Downloads a novel from the Sina reading channel (vip.book.sina.com.cn) into one text file.
 *
 * <p>Java facilities used:
 * <ol>
 *   <li>{@link URL} to connect to the Sina site;</li>
 *   <li>{@link BufferedReader} to read each chapter page line by line;</li>
 *   <li>{@link Pattern} and {@link Matcher} to extract the chapter title and chapter text
 *       with regular expressions.</li>
 * </ol>
 *
 * <p>Example, using the novel shown in the start-up banner:
 * <pre>
 * first chapter url: http://vip.book.sina.com.cn/book/chapter_40768_24564.html
 * afterword url:     http://vip.book.sina.com.cn/book/chapter_40768_24733.html
 * </pre>
 * Enter the three numbers 40768, 24564 and 24733 in turn to download it.
 *
 * @author Zoio
 */
public class GetSinaBook {

    /** Fixed part of every chapter URL; the novel id, "_", the chapter id and ".html" follow it. */
    private static final String CHAPTER_URL_PREFIX = "http://vip.book.sina.com.cn/book/chapter_";

    /** Line separator written after the title, after the body and between paragraphs. */
    private static final String LINE_BREAK = "\r\n";

    /** Matches a whole line containing the chapter title; group 2 is the title text. */
    private static final Pattern TITLE_PATTERN = Pattern.compile("(.*)<h1>(.*)</h1>(.*)");

    /** Matches a whole line containing the chapter body; group 2 is the body markup. */
    private static final Pattern BODY_PATTERN =
            Pattern.compile("(.*)<div id=\"contTxt\" class=\"contTxt1\"><p>(.*)</p><p></p>(.*)");

    /**
     * Prompts for a novel id and a chapter id range, downloads every chapter in that range
     * (both ends inclusive) and writes the concatenated text to {@code D:\<name>.txt}.
     *
     * @param args unused
     * @throws IOException if the output file cannot be created or written
     */
    public static void main(String[] args) throws IOException {
        System.out.println("u can use this program to download from http://vip.book.sina.com.cn/book");
        System.out.println("==================For example Start===================");
        System.out.println("u want to download 泡沫之夏II");
        System.out.println("the first chapter url is：http://vip.book.sina.com.cn/book/chapter_40768_24564.html");
        System.out.println("the last chapter url is：http://vip.book.sina.com.cn/book/chapter_40768_24733.html");
        System.out.println("u must insert:40768,24564,24733");
        System.out.println("==================For example end=====================");
        System.out.println();

        // Deliberately not closed: closing the Scanner would also close System.in.
        Scanner sc = new Scanner(System.in);

        System.out.println("please insert novel id and enter:");
        int novelId = sc.nextInt();
        System.out.println("please insert novel start id an enter:");
        int start = sc.nextInt();
        System.out.println("please insert novel end id and enter:");
        int end = sc.nextInt();
        System.out.println("loading...");

        String chapterUrlBase = CHAPTER_URL_PREFIX + novelId + "_";
        StringBuilder sb = new StringBuilder();
        // The end id is the id of the last chapter itself, so the range is inclusive.
        for (int i = start; i <= end; i++) {
            sb.append(getParagraphContent(chapterUrlBase, i));
        }

        System.out.println("please insert novel'name and enter:");
        String novelName = sc.next();
        // NOTE(review): the output location is hard-coded to the root of drive D.
        String fileName = "D:" + File.separator + novelName + ".txt";
        File f = new File(fileName);
        // try-with-resources closes the file even when write() fails.
        // NOTE(review): encoded with the platform default charset, matching how pages are decoded.
        try (OutputStream out = new FileOutputStream(f)) {
            out.write(sb.toString().getBytes());
        }
        System.out.println(novelName + "下载完毕!请移步至D盘");
    }

    /**
     * Fetches one chapter page and extracts its title and text.
     *
     * <p>The page is scanned line by line: first for a line containing an {@code <h1>} title,
     * then (starting with that same line) for the line containing the {@code contTxt} body
     * element. Scanning stops as soon as the body has been found.
     *
     * @param url   chapter URL up to and including the underscore before the chapter id
     * @param index chapter id; {@code url + index + ".html"} is the page that is fetched
     * @return the title and the body, each followed by CRLF, with {@code </p><p>} paragraph
     *         boundaries turned into CRLF; parts that were not found are omitted, and an empty
     *         string is returned when the page cannot be read, so that a missing chapter id
     *         adds nothing to the output instead of the text "null"
     */
    private static String getParagraphContent(String url, int index) {
        String chapterUrl = url + index + ".html";
        StringBuilder paragraph = new StringBuilder();
        boolean titleFound = false;
        boolean bodyFound = false;

        // NOTE(review): the page is decoded with the platform default charset, as before;
        // confirm the encoding the site actually serves.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new URL(chapterUrl).openStream()))) {
            String line;
            while (!bodyFound && (line = reader.readLine()) != null) {
                if (!titleFound) {
                    // Title parsing.
                    Matcher title = TITLE_PATTERN.matcher(line);
                    if (title.matches()) {
                        paragraph.append(title.group(2)).append(LINE_BREAK);
                        titleFound = true;
                    }
                }
                // Not an else-branch: the body may sit on the same line as the title.
                if (titleFound) {
                    // Body parsing.
                    Matcher body = BODY_PATTERN.matcher(line);
                    if (body.matches()) {
                        paragraph.append(body.group(2)).append(LINE_BREAK);
                        bodyFound = true;
                    }
                }
            }
        } catch (IOException e) {
            System.err.println("failed to download " + chapterUrl + ": " + e);
            return "";
        }

        // Paragraph boundaries inside the body become line breaks.
        return paragraph.toString().replace("</p><p>", LINE_BREAK);
    }
}