Java 利用 HtmlParser 获取网页内容:借助 org.htmlparser.Parser 类,我们可以很轻松地取到任意页面的内容(下面的示例通过 TextExtractingVisitor 提取页面中的文本),方法如下:
/**
 * Fetches the default page ({@code http://www.lingshij.com}) and returns the
 * text extracted from it.
 *
 * <p>Kept so existing callers continue to work; it simply delegates to
 * {@link #getItemDesc(String)} with the original hard-coded address.
 *
 * @return the text extracted from the page, or an empty string if the page
 *         could not be fetched or parsed
 */
public static String getItemDesc() {
    return getItemDesc("http://www.lingshij.com");
}

/**
 * Fetches the page at the given address and returns the text extracted from it.
 *
 * <p>The connection is handed to an HTML Parser {@code Parser}, the encoding
 * is forced to GBK, and every node is visited with a
 * {@code TextExtractingVisitor}; the result is whatever text that visitor
 * collected. Any failure (bad URL, I/O error, parse error, or an unexpected
 * runtime exception) is printed via {@code printStackTrace()} and reported
 * to the caller as an empty string rather than being propagated.
 *
 * @param path absolute URL of the page to read
 * @return the text extracted from the page, or an empty string on failure
 */
public static String getItemDesc(String path) {
    String htmlStr = "";
    try {
        URL url = new URL(path);
        URLConnection conn = url.openConnection();
        // Timeouts are in milliseconds: 5 s to connect, 15 s per read.
        conn.setConnectTimeout(5000);
        conn.setReadTimeout(15000);

        Parser parser = new Parser();
        parser.setConnection(conn);
        parser.setEncoding("GBK");

        TextExtractingVisitor visitor = new TextExtractingVisitor();
        parser.visitAllNodesWith(visitor);
        htmlStr = visitor.getExtractedText();
    } catch (Exception e) {
        // Best-effort contract: ParserException, MalformedURLException,
        // IOException and anything else were all handled identically, so a
        // single handler replaces the former four-way catch chain.
        e.printStackTrace();
    }
    return htmlStr;
}