java中无效字符串,在java中从字符串中删除无效的XML字符

到目前为止,所有这些答案都只会替换无效字符本身。但有时 XML 文档中还会包含无效的 XML 实体序列(数字字符引用),同样会导致解析错误。例如,如果 XML 中包含 &#2;,Java 的 XML 解析器会抛出如下错误:

Illegal character entity: expansion character (code 0x2 at ...


这里有一个简单的java程序,可以替换那些无效的实体序列。

/**
 * Matches one XML numeric character reference, e.g. {@code &#x1F;} or {@code &#31;}.
 *
 * <p>Group 1 captures the digits of a hexadecimal reference ({@code &#x...;}), group 2 the
 * digits of a decimal reference ({@code &#...;}); exactly one of them is non-null per match.
 * Declared {@code static} so the expression is compiled once per class rather than once per
 * instance.
 */
public static final Pattern XML_ENTITY_PATTERN =
    Pattern.compile("\\&\\#(?:x([0-9a-fA-F]+)|([0-9]+))\\;");

/**
 * Removes problematic numeric character references from an XML string so that it can be
 * parsed with the Java DOM / SAX libraries.
 *
 * <p>Every substring matched by {@code XML_ENTITY_PATTERN} (for example {@code &#x2;} or
 * {@code &#2;}) whose numeric value is rejected by {@code isInvalidXmlChar} is deleted.
 * All other text, including valid references such as {@code &#x41;}, is copied through
 * unchanged.
 *
 * @param xmlString the XML text to clean
 * @return the text with every invalid numeric character reference removed; the same
 *     {@code String} instance when nothing had to be removed
 */
String getCleanedXml(String xmlString) {
    Matcher m = XML_ENTITY_PATTERN.matcher(xmlString);
    StringBuilder cleaned = null;
    int copiedUpTo = 0;

    while (m.find()) {
        String hexDigits = m.group(1);
        boolean invalid = (hexDigits != null)
            ? isInvalidNumericReference(hexDigits, 16)
            : isInvalidNumericReference(m.group(2), 10);
        if (!invalid) {
            continue;
        }
        if (cleaned == null) {
            // Allocate lazily: most inputs contain nothing to strip.
            cleaned = new StringBuilder(xmlString.length());
        }
        // Keep the text between the previous removal and this one, skip the reference itself.
        cleaned.append(xmlString, copiedUpTo, m.start());
        copiedUpTo = m.end();
    }

    if (cleaned == null) {
        return xmlString;
    }
    cleaned.append(xmlString, copiedUpTo, xmlString.length());
    return cleaned.toString();
}

/** Parses the digits of a character reference and reports whether it must be removed. */
private boolean isInvalidNumericReference(String digits, int radix) {
    try {
        return isInvalidXmlChar(Integer.parseInt(digits, radix));
    } catch (NumberFormatException overflow) {
        // The pattern only admits digits, so this can only be an int overflow; such a value
        // is far beyond 0x10FFFF and therefore never a legal XML character.
        return true;
    }
}

/**
 * Reports whether a code point falls outside the XML 1.0 {@code Char} production:
 * {@code #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]}.
 *
 * @param val the code point to test
 * @return {@code true} if {@code val} is not a legal XML 1.0 character, {@code false} otherwise
 */
private boolean isInvalidXmlChar(int val) {
    boolean valid =
        val == 0x9 || val == 0xA || val == 0xD
            || (val >= 0x20 && val <= 0xD7FF)
            // Upper part of the BMP after the surrogate block; U+FFFE and U+FFFF stay excluded.
            || (val >= 0xE000 && val <= 0xFFFD)
            || (val >= 0x10000 && val <= 0x10FFFF);
    return !valid;
}