网页元素解析：提取元素标签与变更style样式

前言

如何解析html标签？
如何给标签增加样式？

<div class="related-tags"><span>相关主题推荐：</span>a<a href='http://www.csdn.net/tag/标签' target="_blank">标签</a><a href="http://www.csdn.net/tag/java" target="_blank">java</a><a href="http://www.csdn.net/tag/团购" target="_blank">团购</a><a href="http://www.csdn.net/tag/体育" target="_blank">体育</a><a href="http://www.csdn.net/tag/搜狐" icon='a' target="_blank">搜狐</a>
<img src="/i/eg_tulip2.jpg"  alt="上海鲜花港 - 郁金香"><img src="/i/eg_tulip.jpg"  alt="上海鲜花港 - 郁金香" />  </div>

示例，需要提取这段html代码的a标签的内容，并且想改变style样式，如何处理？

实现代码

先定义网页节点，即节点的组成元素

import java.io.Serializable;
import java.util.List;/*** 网页节点** @author leng*/
/**
 * One HTML tag extracted from a page, together with its parsed attributes.
 *
 * <p>The class is a plain mutable bean: every field has a getter and a setter
 * and no invariants are enforced between them.
 */
public class WebNode implements Serializable {

    /** Full HTML text of the node. */
    private String htmlString;

    /** Text between the opening and the closing tag. */
    private String content;

    /** Tag name, e.g. {@code a} or {@code img}. */
    private String labelType;

    /** Raw HTML of the attribute area of the tag. */
    private String attributeHtmlString;

    /** Whether the tag is a single (unpaired) tag. */
    private boolean isSingleNode;

    /** Parsed attributes of the tag; may be {@code null} until set. */
    private List<AttributeHtml> attributeHtmlList;

    public String getHtmlString() {
        return htmlString;
    }

    public void setHtmlString(String htmlString) {
        this.htmlString = htmlString;
    }

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    public String getAttributeHtmlString() {
        return attributeHtmlString;
    }

    public void setAttributeHtmlString(String attributeHtmlString) {
        this.attributeHtmlString = attributeHtmlString;
    }

    public String getLabelType() {
        return labelType;
    }

    public void setLabelType(String labelType) {
        this.labelType = labelType;
    }

    public boolean isSingleNode() {
        return isSingleNode;
    }

    public void setSingleNode(boolean isSingleNode) {
        this.isSingleNode = isSingleNode;
    }

    public List<AttributeHtml> getAttributeHtmlList() {
        return attributeHtmlList;
    }

    public void setAttributeHtmlList(List<AttributeHtml> attributeHtmlList) {
        this.attributeHtmlList = attributeHtmlList;
    }

    /**
     * Finds the {@code style} attribute of this node.
     *
     * <p>The attribute name is compared case-insensitively. Entries that are
     * {@code null} or have no key are skipped.
     *
     * @return the first attribute whose key is {@code style}, or {@code null}
     *         when the attribute list is unset or holds no such attribute
     */
    public AttributeHtml getStyle() {
        if (attributeHtmlList == null) {
            return null;
        }
        for (AttributeHtml attributeHtml : attributeHtmlList) {
            if (attributeHtml != null && "style".equalsIgnoreCase(attributeHtml.getKey())) {
                return attributeHtml;
            }
        }
        return null;
    }

    /**
     * A single attribute of a tag.
     *
     * <p>Serializable so that a {@link WebNode} holding attributes can itself
     * be serialized.
     */
    static class AttributeHtml implements Serializable {

        private static final long serialVersionUID = 1L;

        /** Source text of the attribute. */
        private String html;

        /** Attribute name. */
        private String key;

        /** Attribute value. */
        private String value;

        public String getHtml() {
            return html;
        }

        public void setHtml(String html) {
            this.html = html;
        }

        public String getKey() {
            return key;
        }

        public void setKey(String key) {
            this.key = key;
        }

        public String getValue() {
            return value;
        }

        public void setValue(String value) {
            this.value = value;
        }
    }
}

定义解析的核心

import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang.StringUtils;
import org.springframework.beans.BeanUtils;import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;/*** 网页工具*/
@Slf4j
public class WebpageUtil {/*** 从网页中获取标题** @param html* @return*/public static String getTitleFromHtmlString(String html) {int startIndex = html.indexOf("<title>");int endIndex = html.indexOf("</title>");if (startIndex == -1 || endIndex == -1) {return null;}String title = html.substring(startIndex + ("<title>".length()), endIndex);return title;}/*** 获取节点(不支持节点内包含节点)** @param html* @return*/public static List<String> getNode(String html, String nodeType) {String endStr = "</" + nodeType + ">";String regex;if (html.indexOf(endStr) > -1) {regex = "<a(\\s+\\w+\\s*=\\s*(\"|\')?(.*?)[\"|\'])*>(.*?)</a>";// 成对标签的元素(例如<a></a>)} else {regex = "<a(\\s+\\w+\\s*=\\s*(\"|\')?(.*?)[\"|\'|\\s])*\\s*\\/?>";// 单标签的元素(例如<img>)}regex = regex.replaceAll("a", nodeType);Pattern p = Pattern.compile(regex, Pattern.CASE_INSENSITIVE);Matcher m = p.matcher(html);ArrayList<String> list = new ArrayList<String>();while (m.find()) {String link = m.group(0).trim();list.add(link);}return list;}/*** 获取节点** @param html* @return*/public static List<WebNode> getNode2(String html, String labelType) {String endStr = "</" + labelType + ">";String regex;boolean isSingleNode = false;if (html.indexOf(endStr) > -1) {regex = "<a(\\s+\\w+\\s*=\\s*(\"|\')?(.*?)[\"|\'])*>(.*?)</a>";// 成对标签的元素(例如<a></a>)} else {isSingleNode = true;regex = "<a(\\s+\\w+\\s*=\\s*(\"|\')?(.*?)[\"|\'|\\s])*\\s*\\/?>";// 单标签的元素(例如<img>)}regex = regex.replaceAll("a", labelType);Pattern p = Pattern.compile(regex, Pattern.CASE_INSENSITIVE);Matcher m = p.matcher(html);List<WebNode> list = new ArrayList<>();while (m.find()) {WebNode node = new WebNode();node.setLabelType(labelType);node.setSingleNode(isSingleNode);String link = m.group(0).trim();node.setHtmlString(link);if (!isSingleNode) {int startIndex = link.substring(0, link.length() - 2).lastIndexOf(">");int endIndex = link.lastIndexOf("<");if (startIndex > -1 && endIndex > -1 && endIndex > startIndex) {String content = link.substring(startIndex + 1, 
endIndex);node.setContent(content);}}int startIndex2 = link.indexOf("<" + labelType);String str = null;if (isSingleNode) {str = link.substring(startIndex2 + ("<" + labelType).length());} else {int startIndex = link.substring(0, link.length() - 2).lastIndexOf(">");str = link.substring(startIndex2 + ("<" + labelType).length(), startIndex);}node.setAttributeHtmlString(str);
//            LinkedHashMap<String, String> params = new LinkedHashMap<>();
//            List<String> attributeList = new ArrayList<>();List<WebNode.AttributeHtml> attributeHtmls = new ArrayList<>();String[] arr = StrUtil.trim(str, "=").split(" ");if (null != arr && arr.length > 0) {for (String s : arr) {if (s.indexOf("=") > -1) {String key = s.substring(0, s.indexOf("="));String value = s.substring(s.indexOf("=") + 1);String k = key, v = value;if (key.startsWith("\'") || key.startsWith("\"")) {k = key.substring(1, key.length() - 1).trim();}if (value.startsWith("\'") || value.startsWith("\"")) {v = value.substring(1, value.length() - 1).trim();}WebNode.AttributeHtml attributeHtml = new WebNode.AttributeHtml();attributeHtml.setKey(k);attributeHtml.setValue(v);attributeHtml.setHtml(s);attributeHtmls.add(attributeHtml);
//                        params.put(k, v);}}}
//            node.setAttributeHtmlList(attributeList);
//            node.setAttributeMap(params);node.setAttributeHtmlList(attributeHtmls);list.add(node);}return list;}/*** 改变样式,会覆盖原来的节点** @param webNode* @param newStyle 新样式会覆盖旧的样式,不要写style，直接传入样式内容就可以了* @return*/public static WebNode changeStyle(WebNode webNode, LinkedHashMap<String, String> newStyle) throws Exception {if (null == webNode || null == newStyle || newStyle.size() == 0) {return null;}StringBuffer stringBuffer = new StringBuffer();newStyle.forEach((k, v) -> {if (null != v) {stringBuffer.append(k).append(":").append(v);}});List<WebNode.AttributeHtml> attributeHtmlList = webNode.getAttributeHtmlList();String htmlString = webNode.getHtmlString();String labelType = webNode.getLabelType();WebNode.AttributeHtml oldStyle = webNode.getStyle();String str = null;String newStyleStr = "style=\"" + stringBuffer.toString() + "\"";//新样式要拼接if (null == oldStyle) {//原来没有就增加样式,标签的尾部，就是>的前面str = htmlString.replaceFirst(">", " " + newStyleStr + ">");} else {//取代旧的样式String html = oldStyle.getHtml();str = htmlString.replaceFirst(html, newStyleStr);}List<WebNode> node2 = getNode2(str, labelType);try {WebNode webNode2 = node2.get(0);BeanUtils.copyProperties(webNode2, webNode);return webNode;} catch (Exception e) {log.error("解析标签发生错误:" + str, e);throw new Exception("解析标签发生错误");}}/*** 添加样式，如果原来style，则新增，如果有替换同样的样式** @param webNode* @param newStyle 新增的样式* @return*/public static WebNode addStyle(WebNode webNode, LinkedHashMap<String, String> newStyle) throws Exception {if (null == webNode || null == newStyle || newStyle.size() == 0) {return null;}List<WebNode.AttributeHtml> attributeHtmlList = webNode.getAttributeHtmlList();String htmlString = webNode.getHtmlString();String labelType = webNode.getLabelType();WebNode.AttributeHtml oldStyle = webNode.getStyle();String str = "";if (null == oldStyle) {StringBuffer stringBuffer = new StringBuffer();newStyle.forEach((k, v) -> {if (null != v) {stringBuffer.append(k).append(":").append(v).append(";");}});String newStyleStr = "style=\"" + 
stringBuffer.toString() + "\"";//新样式要拼接//原来没有就增加样式,标签的尾部，就是>的前面str = htmlString.replaceFirst(">", " " + newStyleStr + ">");} else {//取代旧的样式String html = oldStyle.getHtml();String styleContent = oldStyle.getValue();str = htmlString;LinkedHashMap<String, String> oldStyleMap = StrUtil.stringToMap(styleContent, ";", ":");oldStyleMap = null == oldStyleMap ? new LinkedHashMap<>() : oldStyleMap;//变更样式Iterator<Map.Entry<String, String>> iterator = newStyle.entrySet().iterator();StringBuffer addStyleSb = new StringBuffer();LinkedHashMap<String, String> replaceList = new LinkedHashMap<>();while (iterator.hasNext()) {Map.Entry<String, String> next = iterator.next();String k = next.getKey();if (StringUtils.isEmpty(k)) {continue;}String v = next.getValue();String oldStyleAttr = oldStyleMap.get(k);if (null != oldStyleAttr) {String oldSt = k + ":" + oldStyleAttr;String newSt = k + ":" + v;if (StringUtils.isEmpty(v)) {
//                        str = str.replaceFirst(oldSt, "");replaceList.put(oldSt, "");//如果不设置值，则表示清除样式} else {
//                        str = str.replaceFirst(oldSt, newSt);replaceList.put(oldSt, newSt);}} else {if (StringUtils.isNotEmpty(v)) {addStyleSb.append(k).append(":").append(v).append(";");}}}//先追加新样式if (addStyleSb.length() > 0) {String oldSt = oldStyle.getHtml();String newSt = oldSt.substring(0, oldSt.length() - 1) + addStyleSb + "\"";str = str.replaceFirst(oldSt, newSt);}//再替换已有样式,如果顺序执行反了，就会导致无法使用样式错误Iterator<Map.Entry<String, String>> iterator1 = replaceList.entrySet().iterator();while (iterator1.hasNext()) {Map.Entry<String, String> next = iterator1.next();String k = next.getKey();String v = next.getValue();str = str.replaceFirst(k, v);}}List<WebNode> node2 = getNode2(str, labelType);try {WebNode webNode2 = node2.get(0);BeanUtils.copyProperties(webNode2, webNode);return webNode;} catch (Exception e) {log.error("解析标签发生错误:" + str, e);throw new Exception("解析标签发生错误");}}
}

其它工具类

import org.apache.commons.lang3.StringUtils;import java.util.*;/*** @Description 描述* @Author 作者* @Date 2024/3/29* @return*/
public class StrUtil {

    /**
     * Returns the text between the first {@code startString} and the first
     * {@code endString} that follows it.
     *
     * @param sourceString text to search
     * @param startString  leading marker
     * @param endString    trailing marker, searched for after the leading one
     * @return the text between the markers, or {@code null} when an argument
     *         is {@code null} or empty or a marker is not found
     */
    public static String getBetweenString(String sourceString, String startString, String endString) {
        if (isEmpty(sourceString) || isEmpty(startString) || isEmpty(endString)) {
            return null;
        }
        int startIndex = sourceString.indexOf(startString);
        if (startIndex == -1) {
            return null;
        }
        int contentStart = startIndex + startString.length();
        int endIndex = sourceString.indexOf(endString, contentStart);
        if (endIndex == -1) {
            return null;
        }
        return sourceString.substring(contentStart, endIndex);
    }

    /** Returns {@code true} when the string is {@code null} or has length zero. */
    public static boolean isEmpty(String string) {
        return null == string || string.isEmpty();
    }

    /** Returns {@code true} when the string is neither {@code null} nor of length zero. */
    public static boolean isNotEmpty(String string) {
        return !isEmpty(string);
    }

    /**
     * Repeats {@code string} {@code n} times, joined by {@code seg}.
     *
     * @param string text to repeat
     * @param n      number of repetitions; zero or less yields an empty string
     * @param seg    separator; {@code null} is treated as {@code ""}
     * @return the joined repetitions
     */
    public static String repeatString(String string, int n, String seg) {
        if (n <= 0) {
            // Nothing to join; the separator must not be subtracted from an empty result.
            return "";
        }
        String separator = null == seg ? "" : seg;
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < n; i++) {
            if (i > 0) {
                sb.append(separator);
            }
            sb.append(string);
        }
        return sb.toString();
    }

    /**
     * Repeats {@code num} {@code n} times, joined by {@code seg}.
     *
     * @param num number to repeat
     * @param n   number of repetitions
     * @param seg separator; {@code null} is treated as {@code ""}
     * @return the joined repetitions
     */
    public static String repeatString(int num, int n, String seg) {
        return repeatString(num + "", n, seg);
    }

    /**
     * Repeats {@code num} {@code n} times, joined by a comma.
     *
     * @param num number to repeat
     * @param n   number of repetitions
     * @return the joined repetitions
     */
    public static String repeatString(int num, int n) {
        return repeatString(num + "", n, ",");
    }

    /**
     * Joins the non-null elements of an array with commas.
     *
     * @param intArray values to join
     * @return the joined values, or {@code null} when the array is
     *         {@code null} or holds no non-null element
     */
    public static String IntArrayToString(Integer[] intArray) {
        if (null == intArray) {
            return null;
        }
        StringBuilder sb = new StringBuilder();
        for (Integer a : intArray) {
            if (null != a) {
                sb.append(",").append(a);
            }
        }
        return sb.length() > 0 ? sb.substring(1) : null;
    }

    /**
     * Tells whether {@code str} occurs in {@code array}.
     *
     * @param array      candidates; {@code null} elements are skipped
     * @param str        value to look for
     * @param ignoreCase whether to compare ignoring case
     * @return {@code true} when a match is found; {@code false} otherwise,
     *         including when an argument is {@code null}
     */
    public static boolean isExist(String[] array, String str, boolean ignoreCase) {
        if (null == array || null == str) {
            return false;
        }
        for (String s : array) {
            if (null == s) {
                continue;
            }
            if (ignoreCase ? s.equalsIgnoreCase(str) : s.equals(str)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Splits a string and converts the pieces to integers.
     *
     * @param string   text to split
     * @param splitStr separator, interpreted as a regular expression by
     *                 {@link String#split(String)}
     * @return the parsed numbers in order; pieces that are empty or not
     *         numeric are left out; {@code null} when {@code string} is
     *         {@code null} or empty
     */
    public static List<Integer> parseList(String string, String splitStr) {
        if (isEmpty(string)) {
            return null;
        }
        String[] arrays = string.split(splitStr);
        List<Integer> list = new ArrayList<>(arrays.length);
        collectIntegers(arrays, list);
        return list;
    }

    /**
     * Splits a string on commas and converts the pieces to integers.
     *
     * @param string text to split
     * @return see {@link #parseList(String, String)}
     */
    public static List<Integer> parseList(String string) {
        return parseList(string, ",");
    }

    /**
     * Splits a string and converts the pieces to a set of integers.
     *
     * @param string   text to split
     * @param splitStr separator, interpreted as a regular expression by
     *                 {@link String#split(String)}
     * @return the parsed numbers; pieces that are empty or not numeric are
     *         left out; {@code null} when {@code string} is {@code null} or
     *         empty
     */
    public static Set<Integer> parseSet(String string, String splitStr) {
        if (isEmpty(string)) {
            return null;
        }
        Set<Integer> set = new HashSet<>();
        collectIntegers(string.split(splitStr), set);
        return set;
    }

    /**
     * Splits a string on commas and converts the pieces to a set of integers.
     *
     * @param string text to split
     * @return see {@link #parseSet(String, String)}
     */
    public static Set<Integer> parseSet(String string) {
        return parseSet(string, ",");
    }

    /**
     * Splits a string into a set of its pieces.
     *
     * @param string   text to split
     * @param splitStr separator, interpreted as a regular expression by
     *                 {@link String#split(String)}
     * @return the distinct pieces, or {@code null} when {@code string} is
     *         {@code null} or empty
     */
    public static Set<String> parseSetString(String string, String splitStr) {
        if (isEmpty(string)) {
            return null;
        }
        return new HashSet<>(Arrays.asList(string.split(splitStr)));
    }

    /**
     * Converts a string to a map. For example {@code "1-2,3-4,5-6"} with
     * {@code seg1=","} and {@code seg2="-"} yields {@code {1=2, 3=4, 5=6}}.
     *
     * <p>Each entry is split at the first {@code seg2} only, so a value may
     * itself contain the separator. An entry without a value maps to
     * {@code null}.
     *
     * @param string text to split
     * @param seg1   separator between entries (regular expression)
     * @param seg2   separator between key and value (regular expression)
     * @return the entries in order of appearance, or {@code null} when
     *         {@code string} is {@code null} or empty
     */
    public static LinkedHashMap<String, String> stringToMap(String string, String seg1, String seg2) {
        if (isEmpty(string)) {
            return null;
        }
        String[] entries = string.split(seg1);
        LinkedHashMap<String, String> map = new LinkedHashMap<>(entries.length);
        for (String entry : entries) {
            if (isEmpty(entry)) {
                continue;
            }
            // Limit 2 keeps everything after the first separator as the value.
            String[] parts = entry.split(seg2, 2);
            String key = parts[0];
            String value = parts.length > 1 && !parts[1].isEmpty() ? parts[1] : null;
            if (key.isEmpty() && null == value) {
                // The entry was nothing but the separator.
                continue;
            }
            map.put(key, value);
        }
        return map;
    }

    /**
     * Converts a string such as {@code "1-2,3-4,5-6"} to the map
     * {@code {1=2, 3=4, 5=6}}.
     *
     * @param string text to split
     * @return see {@link #stringToMap(String, String, String)}
     */
    public static LinkedHashMap<String, String> stringToMap(String string) {
        return stringToMap(string, ",", "-");
    }

    /**
     * Removes one leading comma.
     *
     * @param str text to process
     * @return the text without its leading comma, the text itself when it
     *         does not start with a comma, or {@code null} for {@code null}
     */
    public static String repaceFirstComma(String str) {
        if (null == str) {
            return null;
        }
        return str.startsWith(",") ? str.substring(1) : str;
    }

    /**
     * Converts the non-empty elements of a string array to integers.
     *
     * @param array values to convert; {@code null} yields an empty array
     * @return the parsed numbers in order
     * @throws NumberFormatException when a non-empty element is not numeric
     */
    public static int[] strArrayToIntArray(String[] array) {
        if (null == array) {
            return new int[0];
        }
        int[] parsed = new int[array.length];
        int count = 0;
        for (String s : array) {
            if (isNotEmpty(s)) {
                parsed[count++] = Integer.parseInt(s);
            }
        }
        // Drop the slots reserved for skipped elements.
        return Arrays.copyOf(parsed, count);
    }

    /**
     * Removes the spaces directly before and after every occurrence of
     * {@code specialStr}. The special string is matched literally.
     *
     * @param string     text to process
     * @param specialStr text whose surrounding spaces are removed
     * @return the processed text; {@code null} for a {@code null} text; the
     *         text unchanged when {@code specialStr} is {@code null}
     */
    public static String trim(String string, String specialStr) {
        if (null == string) {
            return null;
        }
        if (null == specialStr) {
            return string;
        }
        String leading = " " + specialStr;
        while (string.contains(leading)) {
            string = string.replace(leading, specialStr);
        }
        String trailing = specialStr + " ";
        while (string.contains(trailing)) {
            string = string.replace(trailing, specialStr);
        }
        return string;
    }

    /** Adds every non-empty, numeric piece to {@code target}. */
    private static void collectIntegers(String[] pieces, Collection<Integer> target) {
        for (String piece : pieces) {
            if (isEmpty(piece)) {
                continue;
            }
            try {
                target.add(Integer.parseInt(piece));
            } catch (NumberFormatException ignored) {
                // Best effort by design: pieces that are not numbers are left out.
            }
        }
    }
}

测试示例

/**
 * Walks through the three operations on the sample fragment and prints the
 * results: extracting the {@code a} tags, replacing their style, and merging
 * a declaration into their style. The method only prints; it asserts nothing.
 */
@Test
public void testWebNode() throws Exception {
    String str = "<div class=\"related-tags\">\n" + "    <span>相关主题推荐：</span>\n"
            + "      a<a href=\'http://www.csdn.net/tag/标签\' target=\"_blank\">标签</a>\n"
            + "      <a href=\"http://www.csdn.net/tag/java\" target=\"_blank\">java</a>\n"
            + "      <a href=\"http://www.csdn.net/tag/团购\" target=\"_blank\">团购</a>\n"
            + "      <a href=\"http://www.csdn.net/tag/体育\" target=\"_blank\">体育</a>\n"
            + "      <a href=\"http://www.csdn.net/tag/搜狐\" icon='a' target=\"_blank\">搜狐</a>\n"
            + "<img src=\"/i/eg_tulip2.jpg\"  alt=\"上海鲜花港 - 郁金香\"><img src=\"/i/eg_tulip.jpg\"  alt=\"上海鲜花港 - 郁金香\" />  </div>";

    // Raw source text of every <a> tag.
    List<String> list = WebpageUtil.getNode(str, "a");
    System.out.println("-------a标签原内容------");
    list.forEach(c -> {
        System.out.println(c);
    });

    // The same tags parsed into nodes.
    List<WebNode> list2 = WebpageUtil.getNode2(str, "a");
    System.out.println("-------解析后的数据------");
    list2.forEach(c -> {
        System.out.println("内容:" + c.getContent());
        System.out.println("标签类型:" + c.getLabelType());
        System.out.println("标签的参数:" + JSONObject.toJSONString(c.getAttributeHtmlList()));
    });

    // Replace the whole style of each tag.
    List<WebNode> list3 = WebpageUtil.getNode2(str, "a");
    LinkedHashMap<String, String> style = new LinkedHashMap<>();
    style.put("background-color", "red");
    for (WebNode webNode : list3) {
        WebpageUtil.changeStyle(webNode, style);
    }
    System.out.println("-------改变样式后(如果原来有style则覆盖)------");
    list3.forEach(c -> {
        System.out.println("-----------------------------------");
        System.out.println("标签html:" + c.getHtmlString());
        System.out.println("内容:" + c.getContent());
        System.out.println("标签类型:" + c.getLabelType());
        System.out.println("标签的参数:" + JSONObject.toJSONString(c.getAttributeHtmlList()));
    });

    // Merge one declaration into the style of each tag.
    List<WebNode> list4 = WebpageUtil.getNode2(str, "a");
    LinkedHashMap<String, String> style2 = new LinkedHashMap<>();
    style2.put("background-color", "green");
    for (WebNode webNode : list4) {
        WebpageUtil.addStyle(webNode, style2);
    }
    System.out.println("-------改变样式后(增加样式，不覆盖原来样式，如果是同属性样式，则覆盖，例如原来的bgcolor=red,新增样式bgcolor=green,则替换，否则新增，对其它样式无影响)------");
    list4.forEach(c -> {
        System.out.println("-----------------------------------");
        System.out.println("标签html:" + c.getHtmlString());
        System.out.println("内容:" + c.getContent());
        System.out.println("标签类型:" + c.getLabelType());
        System.out.println("标签的参数:" + JSONObject.toJSONString(c.getAttributeHtmlList()));
    });
}