一:爬取京东数据
package com.esjd.Utils;import lombok.SneakyThrows;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;import java.net.MalformedURLException;
import java.net.URL;public class HtmlParseUtil {@SneakyThrowspublic static void main(String[] args) {//获取请求 https://search.jd.com/Search?keyword=java 需要联网String url = "https://search.jd.com/Search?keyword=java";//解析网页 jsoup返回的 Document就是游览器 Document对象Document document = Jsoup.parse(new URL(url),30000);//获取网页idJ_goodsListElement element = document.getElementById("J_goodsList");System.out.println(element.html());//获取所有的li元素Elements elements = document.getElementsByTag("li");for (Element element1 : elements) {String img = element1.getElementsByTag("img").eq(0).attr("data-lazy-img");String price = element1.getElementsByClass("p-price").eq(0).text();String title = element1.getElementsByClass("p-name").eq(0).text();System.out.println("______________________________________--");System.out.println(img);System.out.println(price);System.out.println(title);}}
}
封装成工具类
/**
 * Downloads the JD search result page for {@code keyword} and converts every
 * {@code <li>} whose class attribute is exactly "gl-item" (case-insensitive)
 * into a {@link Content}.
 *
 * <p>Requires network access. Only the markup present in the downloaded HTML is
 * parsed; data the page would load later through AJAX is not available.
 * Checked exceptions (encoding, malformed URL, I/O) are rethrown unchanged via
 * Lombok's {@code @SneakyThrows}.
 *
 * @param keyword search keyword; it is URL-encoded as UTF-8 before being sent
 * @return the parsed goods, possibly empty, never {@code null}
 */
@SneakyThrows
public List<Content> paresJD(String keyword) {
    // Request target, e.g. https://search.jd.com/Search?keyword=java
    String urlKeywords = URLEncoder.encode(keyword, "UTF-8");
    String url = "https://search.jd.com/Search?keyword=" + urlKeywords + "&enc=utf-8";

    // Parse the page; the jsoup Document plays the role of the browser's Document object.
    // The second argument is the timeout in milliseconds.
    Document document = Jsoup.parse(new URL(url), 30000);

    List<Content> goodsList = new ArrayList<>();
    for (Element item : document.getElementsByTag("li")) {
        // Only list items whose class attribute equals "gl-item" are treated as goods.
        if (!item.attr("class").equalsIgnoreCase("gl-item")) {
            continue;
        }
        // The image URL is read from the "data-lazy-img" attribute of the first <img>.
        String img = item.getElementsByTag("img").eq(0).attr("data-lazy-img");
        String price = item.getElementsByClass("p-price").eq(0).text();
        String title = item.getElementsByClass("p-name").eq(0).text();

        Content content = new Content();
        content.setTitle(title);
        content.setPrice(price);
        content.setImg(img);
        goodsList.add(content);
    }
    return goodsList;
}
编写pojo类
@Data
@AllArgsConstructor
@NoArgsConstructorpublic class Content {//根据业务需求自己添加属性private String title;private String img;private String price;}
解析数据到es中
@Autowired
// 不能直接使用 @Autowired 需要spring容器private RestHighLevelClient restHighLevelClient;//解析数据放入es中public Boolean parseContent(String keywords) throws IOException {List<Content> contents = new HtmlParseUtil().paresJD(keywords);//把查询的数据放入es中BulkRequest bulkRequest = new BulkRequest();bulkRequest.timeout("2m");for (int i = 0; i < contents.size(); i++) {System.out.println(JSON.toJSONString(contents.get(i)));bulkRequest.add(new IndexRequest("jd_goods").source(JSON.toJSONString(contents.get(i)), XContentType.JSON));}BulkResponse bulk = restHighLevelClient.bulk(bulkRequest, RequestOptions.DEFAULT);//判断返回是否成功return !bulk.hasFailures();}}
对应的controller接口
/** Service that performs the scraping and indexing. */
@Autowired
private ContentService contentService;

/**
 * Scrapes data for the given keyword into Elasticsearch.
 *
 * @param keyword keyword taken from the request path
 * @return the result reported by {@code contentService.parseContent}
 * @throws IOException propagated from the service call
 */
@GetMapping("/pares/{keyword}")
public Boolean pares(@PathVariable("keyword") String keyword) throws IOException {
    Boolean indexed = contentService.parseContent(keyword);
    return indexed;
}
二:前后端分离进行搜索实现
搜索实现和搜索高亮实现
新建前端模板进行请求接口编写
// Vue instance backing the search page: holds the typed keyword and the result list.
new Vue({
    el: "#app",
    data: {
        keyword: '',   // text entered by the user
        results: []    // hits returned by the backend
    },
    methods: {
        // Sends the current keyword to the backend and binds the response to `results`.
        searchKey() {
            var keyword = this.keyword;
            console.log(keyword);

            // Plain paginated search (no highlighting) is available as an alternative:
            //   axios.get("search/" + keyword + "/1/10")

            // Highlighted search. The keyword is percent-encoded so that characters such as
            // '?', '#' or spaces stay part of the path segment instead of cutting the URL short.
            axios.get("/HighlightBuilder/" + encodeURIComponent(keyword) + "/1/10").then(response => {
                console.log(response);
                // Bind the returned data.
                this.results = response.data;
            })
        }
    }
})
编写service层
//2. 获取这些数据实现搜索功能public List<Map<String ,Object>> searchPage(String keyword ,int pageNo,int pageSize) throws IOException {if(pageNo<=1){pageNo = 1;}//条件搜索SearchRequest searchRequest = new SearchRequest("jd_goods");SearchSourceBuilder sourceBuilder = new SearchSourceBuilder();//分页sourceBuilder.from(pageNo);sourceBuilder.size(pageSize);//精准匹配TermQueryBuilder termQueryBuilder = QueryBuilders.termQuery("title", keyword);sourceBuilder.query(termQueryBuilder);sourceBuilder.timeout(new TimeValue(60, TimeUnit.SECONDS));//执行搜索searchRequest.source(sourceBuilder);SearchResponse searchResponse = restHighLevelClient.search(searchRequest, RequestOptions.DEFAULT);//解析结果ArrayList<Map<String,Object>> list = new ArrayList<>();for (SearchHit documentFields : searchResponse.getHits().getHits()) {//把所有结果遍历出来然后封装到list集合里面list.add( documentFields.getSourceAsMap());}return list;}//2. 获取这些数据实现搜索高亮功能public List<Map<String ,Object>> searchHighlightBuilder(String keyword ,int pageNo,int pageSize) throws IOException {if(pageNo<=1){pageNo = 1;}//条件搜索SearchRequest searchRequest = new SearchRequest("jd_goods");SearchSourceBuilder sourceBuilder = new SearchSourceBuilder();//分页sourceBuilder.from(pageNo);sourceBuilder.size(pageSize);//精准匹配TermQueryBuilder termQueryBuilder = QueryBuilders.termQuery("title", keyword);sourceBuilder.query(termQueryBuilder);sourceBuilder.timeout(new TimeValue(60, TimeUnit.SECONDS));//高亮HighlightBuilder highlightBuilder = new HighlightBuilder();//设置标题高亮highlightBuilder.field("title");//关闭多个高亮字段显示//highlightBuilder.requireFieldMatch(true);//设置高亮样式highlightBuilder.preTags("<span style='color:red'>");highlightBuilder.postTags("</span>");sourceBuilder.highlighter(highlightBuilder);//执行搜索searchRequest.source(sourceBuilder);SearchResponse searchResponse = restHighLevelClient.search(searchRequest, RequestOptions.DEFAULT);//解析结果ArrayList<Map<String,Object>> list = new ArrayList<>();for (SearchHit hit : searchResponse.getHits().getHits()) {//解析高亮的字段Map<String, HighlightField> 
highlightFields = hit.getHighlightFields();//获取标题HighlightField title = highlightFields.get("title");//原来的结果Map<String, Object> sourceAsMap = hit.getSourceAsMap();//解析高亮字段 把原先的字段替换为高亮字段if (title!= null){Text[] fragments = title.fragments();StringBuilder n_title = new StringBuilder();for (Text text : fragments) {n_title.append(text);}sourceAsMap.put("title", n_title.toString());}
// if (title!= null){
// Text[] fragments = title.fragments();
// String n_title = "";
// for (Text text : fragments) {
// n_title+= text;
// }
// sourceAsMap.put("title",n_title);
//
//
// }//把所有结果遍历出来然后封装到list集合里面list.add(sourceAsMap);}return list;}
实现的接口controller
/**
 * Paginated search endpoint.
 *
 * @param keyword  search keyword from the path
 * @param pageNo   page number from the path
 * @param pageSize page size from the path
 * @return the hits produced by {@code contentService.searchPage}
 * @throws IOException propagated from the service call
 */
@GetMapping("/search/{keyword}/{pageNo}/{pageSize}")
public List<Map<String, Object>> search(
        @PathVariable("keyword") String keyword,
        @PathVariable("pageNo") int pageNo,
        @PathVariable("pageSize") int pageSize) throws IOException {
    List<Map<String, Object>> hits = contentService.searchPage(keyword, pageNo, pageSize);
    return hits;
}

/**
 * Highlighted search endpoint.
 *
 * @param keyword  search keyword from the path
 * @param pageNo   page number from the path
 * @param pageSize page size from the path
 * @return the hits produced by {@code contentService.searchHighlightBuilder}
 * @throws IOException propagated from the service call
 */
@GetMapping("HighlightBuilder/{keyword}/{pageNo}/{pageSize}")
public List<Map<String, Object>> searchHighlightBuilder(
        @PathVariable("keyword") String keyword,
        @PathVariable("pageNo") int pageNo,
        @PathVariable("pageSize") int pageSize) throws IOException {
    List<Map<String, Object>> hits = contentService.searchHighlightBuilder(keyword, pageNo, pageSize);
    return hits;
}
null){
Text[] fragments = title.fragments();
StringBuilder n_title = new StringBuilder();
for (Text text : fragments) {
n_title.append(text);
}
sourceAsMap.put(“title”, n_title.toString());
}
// if (title!= null){
// Text[] fragments = title.fragments();
// String n_title = “”;
// for (Text text : fragments) {
// n_title+= text;
// }
// sourceAsMap.put(“title”,n_title);
//
//
// }
//把所有结果遍历出来然后封装到list集合里面list.add(sourceAsMap);}return list;
}
#### 实现的接口controller
//对数据进行分页
@GetMapping(“/search/{keyword}/{pageNo}/{pageSize}”)
public List<Map<String,Object>> search(@PathVariable(“keyword”) String keyword,
@PathVariable(“pageNo”) int pageNo,
@PathVariable(“pageSize”)int pageSize) throws IOException {
return contentService.searchPage(keyword, pageNo, pageSize);
}
//高亮
@GetMapping(“HighlightBuilder/{keyword}/{pageNo}/{pageSize}”)
public List<Map<String,Object>> searchHighlightBuilder(@PathVariable(“keyword”) String keyword,
@PathVariable(“pageNo”) int pageNo,
@PathVariable(“pageSize”)int pageSize) throws IOException {
return contentService.searchHighlightBuilder(keyword, pageNo, pageSize);
}