// 现在的位置: 首页Coding>正文
// 2012年12月22日 Coding 暂无评论 ⁄ 被围观 8,028 views+


package sinabook;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.URL;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Command-line downloader for novels hosted at http://vip.book.sina.com.cn/book.
 *
 * <p>Java techniques used:
 * <ol>
 *   <li>{@link URL}, to connect to the Sina site;</li>
 *   <li>{@link BufferedReader}, to read the page data;</li>
 *   <li>{@link Pattern} and {@link Matcher}, to extract the novel text with regular
 *       expressions.</li>
 * </ol>
 *
 * <p>Example: to download "泡沫之夏II", whose first chapter URL is
 * http://vip.book.sina.com.cn/book/chapter_40768_24564.html and whose afterword URL is
 * http://vip.book.sina.com.cn/book/chapter_40768_24733.html, enter the three values
 * 40768, 24564 and 24733 in turn.
 *
 * @author Zoio
 */
public class GetSinaBook {

    /** Common prefix of every chapter URL; the novel id and chapter id are appended to it. */
    private static final String CHAPTER_URL_PREFIX = "http://vip.book.sina.com.cn/book/chapter_";

    /** Matches a line containing the chapter title; group 2 is the title text. */
    private static final Pattern TITLE_PATTERN = Pattern.compile("(.*)<h1>(.*)</h1>(.*)");

    /** Matches the line containing the chapter body; group 2 is the body markup. */
    private static final Pattern CONTENT_PATTERN =
            Pattern.compile("(.*)<div id=\"contTxt\" class=\"contTxt1\"><p>(.*)</p><p></p>(.*)");

    /**
     * Prompts for a novel id, a first and a last chapter id, downloads every chapter id in
     * that inclusive range, then prompts for a novel name and writes the collected text to
     * {@code D:\<name>.txt}.
     *
     * @param args unused
     * @throws IOException if the output file cannot be created or written
     */
    public static void main(String[] args) throws IOException {
        System.out.println("u can use this program to download from http://vip.book.sina.com.cn/book");
        System.out.println("==================For example Start===================");
        System.out.println("u want to download 泡沫之夏II");
        System.out.println("the first chapter url is:http://vip.book.sina.com.cn/book/chapter_40768_24564.html");
        System.out.println("the last chapter url is:http://vip.book.sina.com.cn/book/chapter_40768_24733.html");
        System.out.println("u must insert:40768,24564,24733");
        System.out.println("==================For example end=====================");

        // Deliberately not closed: closing the Scanner would also close System.in.
        Scanner sc = new Scanner(System.in);

        System.out.println("please insert novel id and enter:");
        int novelId = sc.nextInt();
        System.out.println("please insert novel start id an enter:");
        int start = sc.nextInt();
        System.out.println("please insert novel end id and enter:");
        int end = sc.nextInt();

        StringBuilder sb = new StringBuilder();
        String chapterUrlPrefix = CHAPTER_URL_PREFIX + novelId + "_";
        // Both bounds are inclusive: the user enters the first and the last chapter id.
        for (int i = start; i <= end; i++) {
            String chapter = getParagraphContent(chapterUrlPrefix, i);
            if (chapter == null) {
                // Download failed (already reported by the helper); skip the id instead of
                // appending the literal text "null" to the book.
                continue;
            }
            sb.append(chapter);
        }

        System.out.println("please insert novel'name and enter:");
        String novelName = sc.next();
        String fileName = "D:" + File.separator + novelName + ".txt";
        File f = new File(fileName);

        // NOTE(review): both the page reader and this conversion use the platform default
        // charset; confirm it matches the encoding the site actually serves.
        byte[] b = sb.toString().getBytes();
        try (OutputStream out = new FileOutputStream(f)) {
            out.write(b);
        }
    }

    /**
     * Downloads the page {@code url + index + ".html"} and extracts the chapter title and
     * body from it.
     *
     * <p>The page is scanned line by line: first for a line containing an {@code <h1>}
     * element (the title), then for the line containing the {@code contTxt} div (the body).
     * Each extracted piece is followed by {@code "\r\n"}, and every {@code </p><p>} inside
     * the result is replaced by {@code "\r\n"}.
     *
     * @param url   chapter URL prefix, up to and including the underscore before the
     *              chapter id
     * @param index chapter id appended to {@code url}
     * @return the extracted text (possibly empty if nothing matched), or {@code null} if
     *         the page could not be downloaded
     */
    private static String getParagraphContent(String url, int index) {
        String address = url + index + ".html";
        StringBuilder paragraph = new StringBuilder();
        // 0 = looking for the title, 1 = looking for the body, 2 = both found.
        int status = 0;
        try {
            URL ebook = new URL(address);
            try (BufferedReader reader =
                    new BufferedReader(new InputStreamReader(ebook.openStream()))) {
                String line;
                while (status < 2 && (line = reader.readLine()) != null) {
                    if (status == 0) {
                        Matcher matcher = TITLE_PATTERN.matcher(line);
                        if (matcher.matches()) {
                            paragraph.append(matcher.group(2)).append("\r\n");
                            status = 1;
                        }
                    }
                    // Not an else-branch: the line that held the title is also checked for
                    // the body, in case both are on the same line.
                    if (status == 1) {
                        Matcher matcher = CONTENT_PATTERN.matcher(line);
                        if (matcher.matches()) {
                            paragraph.append(matcher.group(2)).append("\r\n");
                            status = 2;
                        }
                    }
                }
            }
            // Turn the remaining paragraph boundaries into line breaks.
            return paragraph.toString().replaceAll("</p><p>", "\r\n");
        } catch (IOException e) {
            // Best effort: ids inside the requested range may not all be valid chapters.
            System.err.println("failed to download " + address + ": " + e);
            return null;
        }
    }
}


