I am trying to convert a Word document to HTML using Apache POI. I have a Word document that has a horizontal line after a paragraph. The OOXML for the horizontal line looks like this:
<w:p w14:paraId="721E1052" w14:textId="05637367" w:rsidR="002D1248" w:rsidRPr="00BB3E82" w:rsidRDefault="00B3113F" w:rsidP="00797596">
<w:pPr>
<w:rPr>
<w:rFonts w:eastAsia="Times New Roman" w:cs="Courier New"/>
<w:snapToGrid w:val="0"/>
<w:color w:val="000000"/>
<w:lang w:eastAsia="fi-FI"/>
</w:rPr>
</w:pPr>
<w:r>
<w:rPr>
<w:rFonts w:eastAsia="Times New Roman" w:cs="Courier New"/>
<w:snapToGrid w:val="0"/>
<w:color w:val="000000"/>
<w:lang w:eastAsia="fi-FI"/>
</w:rPr>
<w:pict w14:anchorId="534EEFD0">
<v:rect id="_x0000_i1025" style="width:0;height:1.5pt" o:hralign="center" o:hrstd="t" o:hr="t" fillcolor="#a0a0a0" stroked="f"/>
</w:pict>
</w:r>
</w:p>
For this horizontal line, I want to emit an <hr> tag in the HTML output. However, I am not able to retrieve the "rect" element inside "pict". This is what I have tried so far:
// Fetch the <w:pict> elements of the current run through its underlying CTR bean.
List<org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPicture> pics = run.getCTR().getPictList();
if(pics!=null) {
log.debug("Size of pics = "+pics.size());
for (org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPicture pic : pics) {
// Take the DOM node of the pict element and try to parse it as a VML CTGroup
// in order to reach the rect children through getRectList().
Node picNode = pic.getDomNode();
CTGroup ctGroup = CTGroup.Factory.parse(picNode);
if(ctGroup!=null) {
// This is the call that unexpectedly reports 0 for the sample document.
log.debug("Size of rects= "+ctGroup.getRectList().size());
}
}
// NOTE: the closing brace of the outer "if(pics!=null)" is not part of this excerpt.
The above code prints "Size of pics = 1" and "Size of rects= 0", and I am not sure why that is the case. Any help in understanding how to retrieve the "rect" object would be really appreciated. Thank you.
You cannot parse a com.microsoft.schemas.vml.CTGroup element from the DOM node of an org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPicture.
However, all ooxml-schemas objects inherit from org.apache.xmlbeans.XmlObject, so they can select child elements by namespace URI and local name using XmlObject.selectChildren. What we need to know is that the namespace URI for the com.microsoft.schemas.vml.* elements is "urn:schemas-microsoft-com:vml".
Example:
import java.io.FileInputStream;
import org.apache.poi.xwpf.usermodel.*;
import org.apache.xmlbeans.XmlObject;
import java.util.List;
/**
 * Prints the VML {@code rect} elements found inside the {@code pict} elements of every run
 * of a Word (.docx) document.
 *
 * <p>The {@code rect} children are not exposed through a typed getter on
 * {@code CTPicture}, so they are selected generically by namespace URI and local name via
 * {@link XmlObject#selectChildren(String, String)}.
 */
public class WordReadCTPictureContent {

    /** Namespace URI of the {@code com.microsoft.schemas.vml.*} elements. */
    private static final String VML_NAMESPACE_URI = "urn:schemas-microsoft-com:vml";

    /** Document that is read when no path is given on the command line. */
    private static final String DEFAULT_IN_FILE_PATH = "./HRBetweenParagraphs.docx";

    /**
     * Opens the document and prints, for every run, the number of {@code pict} elements,
     * the number of {@code rect} children of each, and the id of every {@code rect}.
     *
     * @param args optional; {@code args[0]} is the path of the .docx file to read, otherwise
     *     {@code ./HRBetweenParagraphs.docx} is used
     * @throws Exception if the file cannot be opened or parsed
     */
    public static void main(String[] args) throws Exception {
        String inFilePath = (args != null && args.length > 0) ? args[0] : DEFAULT_IN_FILE_PATH;

        // try-with-resources closes both the stream and the document even when reading fails.
        try (FileInputStream in = new FileInputStream(inFilePath);
                XWPFDocument document = new XWPFDocument(in)) {
            for (XWPFParagraph paragraph : document.getParagraphs()) {
                for (XWPFRun run : paragraph.getRuns()) {
                    List<org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPicture> pics =
                            run.getCTR().getPictList();
                    System.out.println("Size of pics = " + pics.size());
                    for (org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPicture pic : pics) {
                        // select com.microsoft.schemas.vml.CTRect children by elementUri and elementLocalName
                        XmlObject[] rects = pic.selectChildren(VML_NAMESPACE_URI, "rect");
                        System.out.println("Count of rects = " + rects.length);
                        for (XmlObject obj : rects) {
                            // selectChildren only promises XmlObject; guard the cast so an untyped
                            // result is reported instead of raising a ClassCastException.
                            if (!(obj instanceof com.microsoft.schemas.vml.CTRect)) {
                                System.out.println(
                                        "Skipping rect that is not a CTRect: " + obj.getClass().getName());
                                continue;
                            }
                            com.microsoft.schemas.vml.CTRect rect = (com.microsoft.schemas.vml.CTRect) obj;
                            // now we can work with found com.microsoft.schemas.vml.CTRect
                            System.out.println("Id of found rect = " + rect.getId());
                        }
                    }
                }
            }
        }
    }
}