【转】java截取带html标签的字符串,再把标签补全(保证页面显示效果)

【转】http://blog.csdn.net/zdtwyjp/article/details/5736430
Java截取带HTML标签的字符串,一般情况下有三种实现方式：
1、在截取字符串函数中对HTML标签进行闭合（对标签作入栈出栈式处理即可）。
2、过滤掉HTML。
3、如果需要保留样式的话，可以按照格式提取各节内容，然后分别截取之后再重新用HTML做出表现样式。
1和2的方法不够好。因为1在截取长度时，是把所有的HTML标签都计算在内的，如果有的内容包含的HTML标签较多，那它实际的正文内容就少了。所以不同内容可能表现长度不一，视觉效果不好。而2，没有格式，当然不可取。所以相对来说，3比较好些。先提取内容然后再将标签补全，具体实现代码如下：
[java]view plain copy
       
   
package string;  
/**
 * Minimal growable list of tag names.
 *
 * <p>Removal does not shift elements: a removed slot is simply set to
 * {@code null} (a tombstone). Consequently {@link #size()} keeps counting
 * removed slots and {@link #get(int)} may return {@code null} for an index
 * that is still below {@code size()}.
 */
public class TagsList {
    /** Backing array; slots at index {@code >= size} are unused. */
    private String[] data;
    /** Number of slots handed out by {@link #add(String)}, tombstones included. */
    private int size = 0;

    /**
     * Creates a list with the given initial capacity.
     *
     * @param size initial capacity of the backing array
     */
    public TagsList(int size) {
        data = new String[size];
    }

    /** Creates a list with an initial capacity of 10. */
    public TagsList() {
        this(10);
    }

    /**
     * Appends an element, growing the backing array when it is full.
     *
     * @param str element to append
     */
    public void add(String str) {
        ensureCapacity(size + 1);
        data[size++] = str;
    }

    /**
     * Returns the element stored at {@code index}.
     *
     * @param index position to read
     * @return the stored element, or {@code null} when the index is outside
     *         {@code [0, size())} or the slot has been removed
     */
    public String get(int index) {
        if (index < 0 || index >= size) {
            return null;
        }
        return data[index];
    }

    /**
     * Removes the first element equal to {@code str} by nulling its slot
     * (no shifting, to keep removal cheap).
     *
     * @param str value to look for; {@code null} never matches
     * @return {@code true} if a matching element was found and removed
     */
    public boolean remove(String str) {
        if (str == null) {
            return false;
        }
        for (int index = 0; index < size; index++) {
            if (str.equals(data[index])) {
                data[index] = null;
                return true;
            }
        }
        return false;
    }

    /**
     * Removes the element at {@code index} by nulling its slot.
     *
     * @param index position to clear
     * @return {@code true} if the index lies inside {@code [0, size())};
     *         {@code false} otherwise (nothing is modified)
     */
    public boolean remove(int index) {
        // Bound by size, not by capacity: unused slots are not elements.
        if (index < 0 || index >= size) {
            return false;
        }
        data[index] = null;
        return true;
    }

    /**
     * Returns the number of slots in use, including removed (null) slots.
     *
     * @return number of elements ever added
     */
    public int size() {
        return this.size;
    }

    /**
     * Grows the backing array so it can hold at least {@code minSize}
     * elements. The new capacity is the larger of {@code minSize} and
     * 1.5 times the old capacity plus one.
     *
     * @param minSize required minimum capacity
     */
    public void ensureCapacity(int minSize) {
        int oldCapacity = data.length;
        if (minSize > oldCapacity) {
            int newCapacity = Math.max(oldCapacity * 3 / 2 + 1, minSize);
            String[] newArray = new String[newCapacity];
            System.arraycopy(data, 0, newArray, 0, oldCapacity);
            data = newArray;
        }
    }
}
  
package string;  
  
/** 
 * java截取带html标签的字符串,再把标签补全(保证页面显示效果)<br> 
 * 一般是用在字符串中有html标签的截取.如: 后台发布用了在线编辑器, 前台显示内容要截取的情况.<br> 
 *  
 * @author YangJunping 
 * @date 2010-7-15 
 */  
/**
 * Truncates a string that contains HTML markup and then re-balances the
 * tags so the fragment still renders correctly.
 *
 * <p>Typical use: content is authored in a rich-text editor and only a
 * short teaser of it is shown on a listing page.
 *
 * @author YangJunping
 */
public class SubStringHTML {
    /** Names of HTML void elements; they never take a closing tag. */
    private static final String[] VOID_ELEMENTS = {
        "area", "base", "br", "col", "embed", "hr", "img", "input",
        "link", "meta", "param", "source", "track", "wbr"
    };

    /** Longest run of letters/digits accepted between '&' and ';' in a character reference. */
    private static final int MAX_ENTITY_BODY = 32;

    public static void main(String[] args) {
        String htmlCode = "<h1><span style=\"font-size: xx-large; color: #000000;\" mce_style=\"font-size: xx-large; color: #000000;\">新华网北京7月13,.</span></h1><h1><span>北京7——月13</span></h1>";
        System.out.println(subStringHTML(htmlCode, 5));
        // Expected output: the first <h1><span ...> with its attributes, the
        // three characters 新华网, then the added </span></h1>.
    }

    /**
     * Truncates {@code param} to roughly {@code length} display units and
     * closes or opens whatever tags the cut left unbalanced.
     *
     * <p>Counting rules: markup (tags, comments) costs nothing; a character
     * reference such as {@code &amp;nbsp;} costs 1; an ASCII character costs 1;
     * any other character (e.g. CJK) costs 2. The unit that reaches or crosses
     * the limit is still included, and tags, character references and
     * surrogate pairs are never split.
     *
     * @param param  text containing HTML markup; {@code null} is treated as empty
     * @param length maximum display width; values {@code <= 0} yield an empty string
     * @return the truncated, tag-balanced fragment
     */
    public static String subStringHTML(String param, int length) {
        if (param == null || length <= 0) {
            return "";
        }
        final int end = param.length();
        StringBuilder result = new StringBuilder();
        int counted = 0;
        int i = 0;
        while (i < end && counted < length) {
            char c = param.charAt(i);
            int next; // index just past the unit that starts at i
            int cost; // display width of that unit
            if (isTagStart(param, i)) {
                int close = findTagEnd(param, i);
                // An unterminated tag cannot be cut safely: keep the remainder as is.
                next = (close < 0) ? end : close + 1;
                cost = 0;
            } else if (c == '&' && entityEnd(param, i) > i) {
                next = entityEnd(param, i) + 1;
                cost = 1;
            } else if (Character.isHighSurrogate(c) && i + 1 < end
                    && Character.isLowSurrogate(param.charAt(i + 1))) {
                next = i + 2;
                cost = 2;
            } else {
                next = i + 1;
                // Charset-independent replacement for getBytes().length > 1.
                cost = (c > 0x7F) ? 2 : 1;
            }
            result.append(param, i, next);
            counted += cost;
            i = next;
        }
        return fix(result.toString());
    }

    /**
     * Balances the tags of an HTML fragment: prepends opening tags for
     * closing tags that have no opener and appends closing tags for
     * elements that were left open (innermost first).
     *
     * @param str HTML fragment
     * @return the fragment with balanced tags
     */
    private static String fix(String str) {
        java.util.List<String> unopened = new java.util.ArrayList<String>();
        java.util.List<String> unclosed = new java.util.ArrayList<String>();
        collectUnbalancedTags(str, unopened, unclosed);

        StringBuilder fixed = new StringBuilder();
        for (int i = unopened.size() - 1; i >= 0; i--) {
            fixed.append('<').append(unopened.get(i)).append('>');
        }
        fixed.append(str);
        for (int i = unclosed.size() - 1; i >= 0; i--) {
            fixed.append("</").append(unclosed.get(i)).append('>');
        }
        return fixed.toString();
    }

    /**
     * Scans {@code str} once and records the tags that are not balanced.
     *
     * @param str      HTML fragment to scan
     * @param unopened receives names of closing tags without a preceding opener, in document order
     * @param unclosed receives names of elements still open at the end, outermost first
     */
    private static void collectUnbalancedTags(String str, java.util.List<String> unopened,
            java.util.List<String> unclosed) {
        final int end = str.length();
        int i = 0;
        while (i < end) {
            if (!isTagStart(str, i)) {
                i++; // plain text, including quotes and stray '<'
                continue;
            }
            int close = findTagEnd(str, i);
            if (close < 0) {
                break; // unterminated tag: nothing after it can be parsed reliably
            }
            char kind = str.charAt(i + 1);
            if (kind == '/') {
                String name = readTagName(str, i + 2, close);
                if (name.length() > 0 && !isVoidElement(name) && !removeLast(unclosed, name)) {
                    unopened.add(name);
                }
            } else if (kind != '!' && kind != '?') { // skip comments, doctype, processing instructions
                String name = readTagName(str, i + 1, close);
                boolean selfClosed = str.charAt(close - 1) == '/';
                if (!selfClosed && !isVoidElement(name)) {
                    unclosed.add(name);
                }
            }
            i = close + 1;
        }
    }

    /** Returns whether a tag, comment or declaration starts at index {@code i}. */
    private static boolean isTagStart(String s, int i) {
        if (s.charAt(i) != '<' || i + 1 >= s.length()) {
            return false;
        }
        char next = s.charAt(i + 1);
        return next == '/' || next == '!' || next == '?' || isAsciiLetter(next);
    }

    /**
     * Finds the '>' that terminates the tag starting at {@code start},
     * ignoring any '>' inside quoted attribute values; comments end at "-->".
     *
     * @return index of the terminating '>', or -1 if the tag is unterminated
     */
    private static int findTagEnd(String s, int start) {
        if (s.startsWith("<!--", start)) {
            int close = s.indexOf("-->", start + 2);
            return (close < 0) ? -1 : close + 2;
        }
        char quote = 0;
        for (int i = start + 1; i < s.length(); i++) {
            char c = s.charAt(i);
            if (quote != 0) {
                if (c == quote) {
                    quote = 0;
                }
            } else if (c == '"' || c == '\'') {
                quote = c;
            } else if (c == '>') {
                return i;
            }
        }
        return -1;
    }

    /**
     * Checks whether a character reference ({@code &name;} or {@code &#123;})
     * starts at {@code amp}.
     *
     * @return index of the terminating ';', or -1 if the '&' is plain text
     */
    private static int entityEnd(String s, int amp) {
        final int end = s.length();
        int i = amp + 1;
        if (i < end && s.charAt(i) == '#') {
            i++;
        }
        int bodyStart = i;
        while (i < end && i - bodyStart < MAX_ENTITY_BODY && isAsciiLetterOrDigit(s.charAt(i))) {
            i++;
        }
        return (i > bodyStart && i < end && s.charAt(i) == ';') ? i : -1;
    }

    /** Reads the tag name in {@code s[from, to)}, stopping at whitespace or '/'. */
    private static String readTagName(String s, int from, int to) {
        int stop = from;
        while (stop < to && !Character.isWhitespace(s.charAt(stop)) && s.charAt(stop) != '/') {
            stop++;
        }
        return s.substring(from, stop);
    }

    /** Removes the innermost (last) open tag matching {@code name}, ignoring case. */
    private static boolean removeLast(java.util.List<String> openTags, String name) {
        for (int i = openTags.size() - 1; i >= 0; i--) {
            if (openTags.get(i).equalsIgnoreCase(name)) {
                openTags.remove(i);
                return true;
            }
        }
        return false;
    }

    /** Returns whether {@code name} is an HTML void element such as br or img. */
    private static boolean isVoidElement(String name) {
        for (int i = 0; i < VOID_ELEMENTS.length; i++) {
            if (VOID_ELEMENTS[i].equalsIgnoreCase(name)) {
                return true;
            }
        }
        return false;
    }

    private static boolean isAsciiLetter(char c) {
        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
    }

    private static boolean isAsciiLetterOrDigit(char c) {
        return isAsciiLetter(c) || (c >= '0' && c <= '9');
    }
}
【转】java截取带html标签的字符串,再把标签补全(保证页面显示效果)

猜你喜欢