Index: src/main/java/org/jboss/dna/sequencer/msoffice/MSOfficeMetadataSequencer.java
===================================================================
--- src/main/java/org/jboss/dna/sequencer/msoffice/MSOfficeMetadataSequencer.java	(revision 727)
+++ src/main/java/org/jboss/dna/sequencer/msoffice/MSOfficeMetadataSequencer.java	(working copy)
@@ -25,6 +25,7 @@
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.util.Iterator;
 import java.util.List;
 import org.jboss.dna.graph.sequencer.SequencerContext;
 import org.jboss.dna.graph.sequencer.SequencerOutput;
@@ -33,6 +34,7 @@
 import org.jboss.dna.sequencer.msoffice.excel.ExcelMetadataReader;
 import org.jboss.dna.sequencer.msoffice.powerpoint.PowerPointMetadataReader;
 import org.jboss.dna.sequencer.msoffice.powerpoint.SlideMetadata;
+import org.jboss.dna.sequencer.msoffice.word.WordMetadata;
 import org.jboss.dna.sequencer.msoffice.word.WordMetadataReader;
 
 /**
@@ -110,6 +112,11 @@
     // Excel specific
     public static final String EXCEL_FULL_CONTENT = "msoffice:full_contents";
     public static final String EXCEL_SHEET_NAME = "msoffice:sheet_name";
+
+    // Word specific
+    public static final String WORD_HEADING_NODE = "msoffice:heading";
+    public static final String WORD_HEADING_NAME = "msoffice:heading_name";
+    public static final String WORD_HEADING_LEVEL = "msoffice:heading_level";
 
     /**
      * {@inheritDoc}
@@ -167,7 +174,20 @@
         if (mimeType.equals("application/vnd.ms-word")) {
             // Sometime in the future this will sequence WORD Table of contents.
             try {
-                /*WordMetadata wordMetadata =*/WordMetadataReader.invoke(stream);
+                WordMetadata wordMetadata = WordMetadataReader.instance(stream);
+
+                // Write each heading to its own same-name-sibling node (msoffice:heading[1], [2], ...);
+                // with one shared path, later headings would replace the properties of earlier ones.
+                // NOTE(review): assumes the output resolves JCR "[n]" indexes in string paths - confirm.
+                int headingIndex = 1;
+                for (Iterator<WordMetadata.WordHeading> iter = wordMetadata.getHeadings().iterator(); iter.hasNext(); headingIndex++) {
+                    WordMetadata.WordHeading heading = iter.next();
+                    String headingPath = METADATA_NODE + "/" + WORD_HEADING_NODE + "[" + headingIndex + "]";
+
+                    output.setProperty(headingPath, WORD_HEADING_NAME, heading.getText());
+                    output.setProperty(headingPath, WORD_HEADING_LEVEL, heading.getHeaderLevel());
+                }
+
             } catch (IOException e) {
                 // There was an error reading, so log and continue ...
                 context.getLogger(this.getClass()).debug(e, "Error while extracting the Word document metadata");
Index: src/main/java/org/jboss/dna/sequencer/msoffice/word/WordMetadata.java
===================================================================
--- src/main/java/org/jboss/dna/sequencer/msoffice/word/WordMetadata.java	(revision 727)
+++ src/main/java/org/jboss/dna/sequencer/msoffice/word/WordMetadata.java	(working copy)
@@ -24,10 +24,54 @@
 
 package org.jboss.dna.sequencer.msoffice.word;
 
+import java.util.List;
+
 /**
  * @author Michael Trezzi
  */
 public class WordMetadata {
 
+    private List<WordHeading> headings;
 
+    /**
+     * @return the headings, or null if {@link #setHeadings(List)} has not been called
+     */
+    public List<WordHeading> getHeadings() {
+        return headings;
+    }
+
+    public void setHeadings(List<WordHeading> headings) {
+        this.headings = headings;
+    }
+
+    /**
+     * A single heading of a Word document: its text and its heading level.
+     */
+    public static class WordHeading {
+        private String text;
+        private int headingLevel;
+
+        public WordHeading(String text, int headerLevel) {
+            super();
+            this.text = text;
+            this.headingLevel = headerLevel;
+        }
+
+        public String getText() {
+            return text;
+        }
+
+        public void setText(String text) {
+            this.text = text;
+        }
+
+        public int getHeaderLevel() {
+            return headingLevel;
+        }
+
+        public void setHeaderLevel(int headerLevel) {
+            this.headingLevel = headerLevel;
+        }
+
+    }
 }
Index: src/main/java/org/jboss/dna/sequencer/msoffice/word/WordMetadataReader.java
===================================================================
--- src/main/java/org/jboss/dna/sequencer/msoffice/word/WordMetadataReader.java	(revision 727)
+++ src/main/java/org/jboss/dna/sequencer/msoffice/word/WordMetadataReader.java	(working copy)
@@ -26,22 +26,75 @@
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
+
 import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.model.StyleDescription;
+import org.apache.poi.hwpf.model.StyleSheet;
+import org.apache.poi.hwpf.usermodel.Paragraph;
+import org.apache.poi.hwpf.usermodel.Range;
+import org.jboss.dna.common.util.Logger;
 
 /**
- * Reades table of contents from Word document
+ * Infers table of contents from Word document by reading all paragraphs
+ * with style <code>Heading*</code>. This is analogous to the default
+ * behavior of Word when generating a table of contents.
  *
  * @author Michael Trezzi
  */
 public class WordMetadataReader {
 
-    @SuppressWarnings( "null" )
-    public static WordMetadata invoke( InputStream stream ) throws IOException {
-        WordMetadata metadata = new WordMetadata();
+    private static final Logger log = Logger.getLogger(WordMetadataReader.class);
+
+    /** Prefix for styles that will be extracted and treated as outline information for the document */
+    private static final String HEADER_PREFIX = "Heading";
+
+    /**
+     * Reads the Word document from the supplied stream and collects one heading entry for every paragraph
+     * whose style name starts with <code>Heading</code>.
+     *
+     * @param stream the stream containing the Word document
+     * @return the metadata holding the headings in paragraph order; never null
+     * @throws IOException if the document cannot be read from the stream
+     */
+    public static WordMetadata instance( InputStream stream ) throws IOException {
+        WordMetadata metadata = new WordMetadata();
+        List<WordMetadata.WordHeading> headings = new ArrayList<WordMetadata.WordHeading>();
+
         HWPFDocument document = new HWPFDocument(stream);
-        if (document != null) {
-            // TODO
+        Range range = document.getRange();
+        StyleSheet stylesheet = document.getStyleSheet();
+
+        for (int i = 0; i < range.numParagraphs(); i++) {
+            Paragraph paragraph = range.getParagraph(i);
+
+            // A paragraph whose style has no description or no name cannot be matched as a heading, so skip it
+            StyleDescription style = stylesheet.getStyleDescription(paragraph.getStyleIndex());
+            String styleName = style != null ? style.getName() : null;
+            if (styleName == null || !styleName.startsWith(HEADER_PREFIX)) {
+                continue;
+            }
+
+            // "Heading 2" -> "2"; a bare "Heading" yields an empty string and falls back to level 0
+            String rawLevelNum = styleName.substring(HEADER_PREFIX.length()).trim();
+            int levelNum = 0;
+            try {
+                levelNum = Integer.parseInt(rawLevelNum);
+            } catch (NumberFormatException nfe) {
+                log.debug("Could not parse heading level from: " + styleName);
+            }
+
+            // Drop the trailing carriage return; endsWith is also safe when the heading text is empty
+            String text = Paragraph.stripFields(paragraph.text());
+            if (text.endsWith("\r")) {
+                text = text.substring(0, text.length() - 1);
+            }
+
+            headings.add(new WordMetadata.WordHeading(text, levelNum));
         }
+
+        metadata.setHeadings(headings);
         return metadata;
     }
 }
Index: src/test/java/org/jboss/dna/sequencer/msoffice/word/WordMetadataTest.java
===================================================================
--- src/test/java/org/jboss/dna/sequencer/msoffice/word/WordMetadataTest.java	(revision 0)
+++ src/test/java/org/jboss/dna/sequencer/msoffice/word/WordMetadataTest.java	(revision 0)
@@ -0,0 +1,79 @@
+/*
+ * JBoss DNA (http://www.jboss.org/dna)
+ * See the COPYRIGHT.txt file distributed with this work for information
+ * regarding copyright ownership.  Some portions may be licensed
+ * to Red Hat, Inc. under one or more contributor license agreements.
+ * See the AUTHORS.txt file in the distribution for a full listing of
+ * individual contributors.
+ *
+ * JBoss DNA is free software. Unless otherwise indicated, all code in JBoss DNA
+ * is licensed to you under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * JBoss DNA is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this software; if not, write to the Free
+ * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA, or see the FSF site: http://www.fsf.org.
+ */
+
+package org.jboss.dna.sequencer.msoffice.word;
+
+import static org.hamcrest.core.Is.is;
+import org.junit.After;
+import static org.junit.Assert.assertThat;
+import org.junit.Test;
+import java.io.InputStream;
+import java.util.List;
+
+/**
+ * Verifies that {@link WordMetadataReader} extracts the expected headings from the <code>word.doc</code> test resource.
+ */
+public class WordMetadataTest {
+
+    private static final String[] TEST_HEADERS_TEXT = new String[] {
+        "Test Heading 1", "Test Heading 1.1", "Test Heading 1.2", "Test Heading 1.2.1",
+        "Test Heading 2", "Test Heading 2.1", "Test Heading 2.2",
+    };
+    private static final int[] TEST_HEADERS_LEVEL = new int[] {
+        1, 2, 2, 3, 1, 2, 2
+    };
+
+    private WordMetadata wordMetadata;
+    private InputStream imageStream;
+
+    @After
+    public void afterEach() throws Exception {
+        if (imageStream != null) {
+            try {
+                imageStream.close();
+            } finally {
+                imageStream = null;
+            }
+        }
+    }
+
+    protected InputStream getTestDocument( String resourcePath ) {
+        return this.getClass().getResourceAsStream("/" + resourcePath);
+    }
+
+    @Test
+    public void shouldBeAbleToParseHeadingsForWord() throws Exception {
+        // Keep the stream in the field so afterEach() closes it even when an assertion fails
+        imageStream = this.getTestDocument("word.doc");
+        wordMetadata = WordMetadataReader.instance(imageStream);
+        List<WordMetadata.WordHeading> headings = wordMetadata.getHeadings();
+
+        assertThat(headings.size(), is(TEST_HEADERS_TEXT.length));
+
+        for (int i = 0; i < headings.size(); i++) {
+            assertThat(headings.get(i).getText(), is(TEST_HEADERS_TEXT[i]));
+            assertThat(headings.get(i).getHeaderLevel(), is(TEST_HEADERS_LEVEL[i]));
+        }
+    }
+}