package step1;
import java.io.File;
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

/**
 * Step 1: load locally saved HTML snapshots into jsoup documents.
 */
public class Task {

    /**
     * Parses the saved page at {@code filePath} as UTF-8.
     *
     * @param filePath file path, e.g. backups/www.ctrip.com.txt
     * @return the parsed document
     * @throws IOException if the file cannot be read
     */
    public Document getHtml1(String filePath) throws IOException {
        /********** Begin **********/
        // The third argument is the base URI jsoup uses to resolve relative links.
        return Jsoup.parse(new File(filePath), "UTF-8", "/backups/www.ctrip.com.txt/");
        /********** End **********/
    }

    /**
     * Parses the saved page at {@code filePath} as UTF-8.
     *
     * @param filePath file path, e.g. backups/hotels.ctrip.com_domestic-city-hotel.txt
     * @return the parsed document
     * @throws IOException if the file cannot be read
     */
    public Document getHtml2(String filePath) throws IOException {
        /********** Begin **********/
        // The third argument is the base URI jsoup uses to resolve relative links.
        return Jsoup.parse(
                new File(filePath),
                "UTF-8",
                "/backups/hotels.ctrip.com_domestic-city-hotel.txt/");
        /********** End **********/
    }
}
第2关 解析并提取HTML 元素(一)
package step2;
import java.io.File;
import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class Task {//通过filePath文件路径获取Docment对象public Document getDoc1(String filePath) throws IOException{/********** Begin **********/File file = new File(filePath);Document document = Jsoup.parse(file, "UTF-8","/backups/www.ctrip.com.txt");return document;/********** End **********/}public Document getDoc2(String filePath) throws IOException{/********** Begin **********/File fe = new File(filePath);Document dt = Jsoup.parse(fe, "UTF-8","/backups/you.ctrip.com.txt");return dt;/********** End **********/}//获取所有链接public Elements getLinks(Document doc){/********** Begin **********/return doc.select("link[href]");/********** End **********/}//获取第一个class为“pop_attention”的divpublic Element getDiv(Document doc){/********** Begin **********/return doc.select("div.pop_attention").first();/********** End **********/}//获取所有li之后的i标签public Elements getI(Document doc){/********** Begin **********/return doc.select("li > i");/********** Edn **********/}}
第3关 解析并提取HTML 元素(二)
package step3;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class Task {//通过filePath文件路径获取Docment对象public Document getDoc(String filePath) throws IOException{/********** Begin **********/return Jsoup.parse(new File(filePath), "uft-8");/********** End **********/}//获取所有链接public List<String> getLinks(Document doc){/********** Begin **********/Elements select = doc.select("a[href]");List<String> list = new ArrayList<>();for (Element element : select){String temp = element.attr("href");if(!temp.startsWith("http")) temp = "http:" + temp;list.add(element.tagName() + "$" + temp + "(" + element.text() + ")");}return list;/********** End **********/}//获取图片public List<String> getMedia(Document doc){/********** Begin **********/Elements img = doc.select("img");List<String> list = new ArrayList<>();for (Element element : img){String temp = element.attr("src");if(!temp.startsWith("http")) temp = "http:" + temp;list.add(element.tagName() + "$" + temp);}return list;/********** End **********/}//获取link[href]链接public List<String> getImports(Document doc){/********** Begin **********/Elements link = doc.select("link");List<String> list = new ArrayList<>();for (Element value : link){String temp = value.attr("href");if(!temp.startsWith("http")) temp = "http:" + temp;list.add(value.tagName() + "$" + temp + "(" + value.attr("rel") + ")");}return list;/********** End **********/}}
第4关 使用Jsoup抓取携程旅游网全国城市信息
package step4;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class Task {//通过filePath文件路径获取Docment对象public Document getDoc(String filePath) throws IOException{/********** Begin **********/File file = new File(filePath);Document doc = Jsoup.parse(file, "uft-8", "/backups/hotels.ctrip.com_domestic-city-hotel.txt");return doc;/********** End **********/}/*** 获取所有城市返回城市信息集合* @param doc * @return*/public List<HotelCity> getAllCitys(Document doc){/********** Begin **********/Elements select1 = doc.select("dl.layoutfix");List<HotelCity> list = new ArrayList<>();for (Element element : select1.select("a")){HotelCity hotelCity = new HotelCity();hotelCity.setCityId(element.attr("href").replaceAll("[^(0-9)]", ""));hotelCity.setCityName(element.text());hotelCity.setPinyin(element.attr("href").split("/")[2].replaceAll("[^(a-zA-Z)]",""));hotelCity.setHeadPinyin("A");list.add(hotelCity);}return list;/********** End **********/}
}