exports.createBodyReader = createBodyReader;
exports._readNumberingProperties = readNumberingProperties;

var dingbatToUnicode = require("dingbat-to-unicode");
var _ = require("underscore");

var documents = require("../documents");
var Result = require("../results").Result;
var warning = require("../results").warning;
var uris = require("./uris");

/**
 * Creates a reader for body XML elements.
 *
 * Every call to `readXmlElement` / `readXmlElements` on the returned object
 * constructs a fresh BodyReader, so complex-field state is not shared between
 * separate calls.
 *
 * @param {Object} options - Supplies `relationships`, `contentTypes`,
 *     `docxFile`, `files`, `numbering` and `styles`.
 * @returns {{readXmlElement: Function, readXmlElements: Function}}
 */
function createBodyReader(options) {
    return {
        readXmlElement: function(element) {
            return new BodyReader(options).readXmlElement(element);
        },
        readXmlElements: function(elements) {
            return new BodyReader(options).readXmlElements(elements);
        }
    };
}

/**
 * Reader that converts XML elements into `documents` nodes, dispatching on the
 * element name via the `xmlElementReaders` table.
 *
 * Returns a plain object exposing `readXmlElement` and `readXmlElements`.
 *
 * @param {Object} options - See createBodyReader.
 */
function BodyReader(options) {
    // Complex fields that are currently open: pushed on fldChar "begin",
    // popped on fldChar "end".
    var complexFieldStack = [];
    // Text of the w:instrText elements seen since the last fldChar "begin".
    var currentInstrText = [];
    var relationships = options.relationships;
    var contentTypes = options.contentTypes;
    var docxFile = options.docxFile;
    var files = options.files;
    var numbering = options.numbering;
    var styles = options.styles;

    /**
     * Reads each element in order and combines the results into a single
     * ReadResult.
     */
    function readXmlElements(elements) {
        var results = elements.map(readXmlElement);
        return combineResults(results);
    }

    /**
     * Reads a single node. Nodes that are not of type "element" and elements
     * listed in `ignoreElements` yield an empty result; elements with no
     * handler that are not explicitly ignored yield an empty result carrying
     * a warning.
     */
    function readXmlElement(element) {
        if (element.type === "element") {
            var handler = xmlElementReaders[element.name];
            if (handler) {
                return handler(element);
            } else if (!Object.prototype.hasOwnProperty.call(ignoreElements, element.name)) {
                var message = warning("An unrecognised element was ignored: " + element.name);
                return emptyResultWithMessages([message]);
            }
        }
        return emptyResult();
    }

    /**
     * Reads indent attributes, preferring w:start / w:end and falling back to
     * w:left / w:right.
     */
    function readParagraphIndent(element) {
        return {
            start: element.attributes["w:start"] || element.attributes["w:left"],
            end: element.attributes["w:end"] || element.attributes["w:right"],
            firstLine: element.attributes["w:firstLine"],
            hanging: element.attributes["w:hanging"]
        };
    }

    /**
     * Reads a w:rPr element into a "runProperties" object.
     */
    function readRunProperties(element) {
        return readRunStyle(element).map(function(style) {
            var fontSizeString = element.firstOrEmpty("w:sz").attributes["w:val"];
            // w:sz gives the font size in half points, so halve the value to get the size in points
            var fontSize = /^[0-9]+$/.test(fontSizeString) ? parseInt(fontSizeString, 10) / 2 : null;

            return {
                type: "runProperties",
                styleId: style.styleId,
                styleName: style.name,
                verticalAlignment: element.firstOrEmpty("w:vertAlign").attributes["w:val"],
                font: element.firstOrEmpty("w:rFonts").attributes["w:ascii"],
                fontSize: fontSize,
                isBold: readBooleanElement(element.first("w:b")),
                isUnderline: readUnderline(element.first("w:u")),
                isItalic: readBooleanElement(element.first("w:i")),
                isStrikethrough: readBooleanElement(element.first("w:strike")),
                isAllCaps: readBooleanElement(element.first("w:caps")),
                isSmallCaps: readBooleanElement(element.first("w:smallCaps"))
            };
        });
    }

    /**
     * Returns true only when the element is present, has a w:val attribute,
     * and that value is not "false", "0" or "none".
     */
    function readUnderline(element) {
        if (element) {
            var value = element.attributes["w:val"];
            return value !== undefined && value !== "false" && value !== "0" && value !== "none";
        } else {
            return false;
        }
    }

    /**
     * Returns true when the element is present and its w:val is neither
     * "false" nor "0" (a missing w:val counts as true).
     */
    function readBooleanElement(element) {
        if (element) {
            var value = element.attributes["w:val"];
            return value !== "false" && value !== "0";
        } else {
            return false;
        }
    }

    function readParagraphStyle(element) {
        return readStyle(element, "w:pStyle", "Paragraph", styles.findParagraphStyleById);
    }

    function readRunStyle(element) {
        return readStyle(element, "w:rStyle", "Run", styles.findCharacterStyleById);
    }

    function readTableStyle(element) {
        return readStyle(element, "w:tblStyle", "Table", styles.findTableStyleById);
    }

    /**
     * Reads the style reference held in the child `styleTagName`.
     *
     * @returns {ReadResult} Result whose value is `{styleId, name}`; both are
     *     null when there is no style element. A warning is attached when a
     *     style ID is present but `findStyleById` returns nothing for it.
     */
    function readStyle(element, styleTagName, styleType, findStyleById) {
        var messages = [];
        var styleElement = element.first(styleTagName);
        var styleId = null;
        var name = null;
        if (styleElement) {
            styleId = styleElement.attributes["w:val"];
            if (styleId) {
                var style = findStyleById(styleId);
                if (style) {
                    name = style.name;
                } else {
                    messages.push(undefinedStyleWarning(styleType, styleId));
                }
            }
        }
        return elementResultWithMessages({styleId: styleId, name: name}, messages);
    }

    var unknownComplexField = {type: "unknown"};

    /**
     * Tracks complex field state from w:fldChar elements:
     * "begin" pushes an unknown field and resets the instruction text,
     * "separate" replaces the top of the stack with a hyperlink field when the
     * accumulated instruction text parses as one, and "end" pops the field.
     * Never produces any output element.
     */
    function readFldChar(element) {
        var type = element.attributes["w:fldCharType"];
        if (type === "begin") {
            complexFieldStack.push(unknownComplexField);
            currentInstrText = [];
        } else if (type === "end") {
            complexFieldStack.pop();
        } else if (type === "separate") {
            var hyperlinkOptions = parseHyperlinkFieldCode(currentInstrText.join(''));
            var complexField = hyperlinkOptions === null ? unknownComplexField : {type: "hyperlink", options: hyperlinkOptions};
            complexFieldStack.pop();
            complexFieldStack.push(complexField);
        }
        return emptyResult();
    }

    /**
     * Returns the options of the most recently pushed hyperlink field that is
     * still on the stack, or null if there is none.
     */
    function currentHyperlinkOptions() {
        var topHyperlink = _.last(complexFieldStack.filter(function(complexField) {
            return complexField.type === "hyperlink";
        }));
        return topHyperlink ? topHyperlink.options : null;
    }

    /**
     * Parses field instruction text. Returns `{href}` for an external
     * HYPERLINK, `{anchor}` for a `\l` (internal) HYPERLINK, or null when the
     * text matches neither pattern.
     */
    function parseHyperlinkFieldCode(code) {
        var externalLinkResult = /\s*HYPERLINK "(.*)"/.exec(code);
        if (externalLinkResult) {
            return {href: externalLinkResult[1]};
        }

        var internalLinkResult = /\s*HYPERLINK\s+\\l\s+"(.*)"/.exec(code);
        if (internalLinkResult) {
            return {anchor: internalLinkResult[1]};
        }

        return null;
    }

    /**
     * Accumulates field instruction text; produces no output element.
     */
    function readInstrText(element) {
        currentInstrText.push(element.text());
        return emptyResult();
    }

    /**
     * Converts a w:sym element into a Text node, or an empty result with a
     * warning when the font/char pair cannot be mapped.
     */
    function readSymbol(element) {
        // See 17.3.3.30 sym (Symbol Character) of ECMA-376 4th edition Part 1
        var font = element.attributes["w:font"];
        var char = element.attributes["w:char"];
        var unicodeCharacter = dingbatToUnicode.hex(font, char);
        if (unicodeCharacter == null && /^F0..$/.test(char)) {
            // Retry without the leading "F0" when the four-character code was not found.
            unicodeCharacter = dingbatToUnicode.hex(font, char.substring(2));
        }

        if (unicodeCharacter == null) {
            return emptyResultWithMessages([warning(
                "A w:sym element with an unsupported character was ignored: char " + char + " in font " + font
            )]);
        } else {
            return elementResult(new documents.Text(unicodeCharacter.string));
        }
    }

    /**
     * Builds a reader producing a NoteReference of the given note type, using
     * the element's w:id as the note ID.
     */
    function noteReferenceReader(noteType) {
        return function(element) {
            var noteId = element.attributes["w:id"];
            return elementResult(new documents.NoteReference({
                noteType: noteType,
                noteId: noteId
            }));
        };
    }

    function readCommentReference(element) {
        return elementResult(documents.commentReference({
            commentId: element.attributes["w:id"]
        }));
    }

    function readChildElements(element) {
        return readXmlElements(element.children);
    }

    // Dispatch table from XML element name to the function that reads it.
    var xmlElementReaders = {
        "w:p": function(element) {
            return readXmlElements(element.children)
                .map(function(children) {
                    var properties = _.find(children, isParagraphProperties);
                    return new documents.Paragraph(
                        children.filter(negate(isParagraphProperties)),
                        properties
                    );
                })
                // Elements deferred through toExtra() (see "w:pict") are
                // placed after the paragraph.
                .insertExtra();
        },
        "w:pPr": function(element) {
            return readParagraphStyle(element).map(function(style) {
                return {
                    type: "paragraphProperties",
                    styleId: style.styleId,
                    styleName: style.name,
                    alignment: element.firstOrEmpty("w:jc").attributes["w:val"],
                    numbering: readNumberingProperties(style.styleId, element.firstOrEmpty("w:numPr"), numbering),
                    indent: readParagraphIndent(element.firstOrEmpty("w:ind"))
                };
            });
        },
        "w:r": function(element) {
            return readXmlElements(element.children)
                .map(function(children) {
                    var properties = _.find(children, isRunProperties);
                    children = children.filter(negate(isRunProperties));

                    // Inside a hyperlink complex field, wrap the run's
                    // content in a Hyperlink.
                    var hyperlinkOptions = currentHyperlinkOptions();
                    if (hyperlinkOptions !== null) {
                        children = [new documents.Hyperlink(children, hyperlinkOptions)];
                    }

                    return new documents.Run(children, properties);
                });
        },
        "w:rPr": readRunProperties,
        "w:fldChar": readFldChar,
        "w:instrText": readInstrText,
        "w:t": function(element) {
            return elementResult(new documents.Text(element.text()));
        },
        "w:tab": function(element) {
            return elementResult(new documents.Tab());
        },
        "w:noBreakHyphen": function() {
            return elementResult(new documents.Text("\u2011"));
        },
        "w:softHyphen": function(element) {
            return elementResult(new documents.Text("\u00AD"));
        },
        "w:sym": readSymbol,
        "w:hyperlink": function(element) {
            var relationshipId = element.attributes["r:id"];
            var anchor = element.attributes["w:anchor"];
            return readXmlElements(element.children).map(function(children) {
                function create(options) {
                    var targetFrame = element.attributes["w:tgtFrame"] || null;

                    return new documents.Hyperlink(
                        children,
                        _.extend({targetFrame: targetFrame}, options)
                    );
                }

                if (relationshipId) {
                    var href = relationships.findTargetByRelationshipId(relationshipId);
                    if (anchor) {
                        href = uris.replaceFragment(href, anchor);
                    }
                    return create({href: href});
                } else if (anchor) {
                    return create({anchor: anchor});
                } else {
                    // Neither a relationship nor an anchor: keep the
                    // children without a wrapping Hyperlink.
                    return children;
                }
            });
        },
        "w:tbl": readTable,
        "w:tr": readTableRow,
        "w:tc": readTableCell,
        "w:footnoteReference": noteReferenceReader("footnote"),
        "w:endnoteReference": noteReferenceReader("endnote"),
        "w:commentReference": readCommentReference,
        "w:br": function(element) {
            var breakType = element.attributes["w:type"];
            if (breakType == null || breakType === "textWrapping") {
                return elementResult(documents.lineBreak);
            } else if (breakType === "page") {
                return elementResult(documents.pageBreak);
            } else if (breakType === "column") {
                return elementResult(documents.columnBreak);
            } else {
                return emptyResultWithMessages([warning("Unsupported break type: " + breakType)]);
            }
        },
        "w:bookmarkStart": function(element){
            var name = element.attributes["w:name"];
            if (name === "_GoBack") {
                return emptyResult();
            } else {
                return elementResult(new documents.BookmarkStart({name: name}));
            }
        },

        "mc:AlternateContent": function(element) {
            // Only the fallback branch is read. An mc:AlternateContent with
            // no mc:Fallback child yields an empty result rather than
            // throwing on a null element.
            var fallback = element.first("mc:Fallback");
            return fallback ? readChildElements(fallback) : emptyResult();
        },

        "w:sdt": function(element) {
            return readXmlElements(element.firstOrEmpty("w:sdtContent").children);
        },

        "w:ins": readChildElements,
        "w:object": readChildElements,
        "w:smartTag": readChildElements,
        "w:drawing": readChildElements,
        "w:pict": function(element) {
            // Defer the content as "extra" so the enclosing paragraph can
            // re-insert it via insertExtra().
            return readChildElements(element).toExtra();
        },
        "v:roundrect": readChildElements,
        "v:shape": readChildElements,
        "v:textbox": readChildElements,
        "w:txbxContent": readChildElements,
        "wp:inline": readDrawingElement,
        "wp:anchor": readDrawingElement,
        "v:imagedata": readImageData,
        "v:group": readChildElements,
        "v:rect": readChildElements
    };

    // The function declarations below are hoisted, so they are available to
    // the dispatch table above even though they follow this return.
    return {
        readXmlElement: readXmlElement,
        readXmlElements: readXmlElements
    };


    /**
     * Reads a w:tbl element into a Table, resolving vertical merges into row
     * spans and attaching the table style properties.
     */
    function readTable(element) {
        var propertiesResult = readTableProperties(element.firstOrEmpty("w:tblPr"));
        return readXmlElements(element.children)
            .flatMap(calculateRowSpans)
            .flatMap(function(children) {
                return propertiesResult.map(function(properties) {
                    return documents.Table(children, properties);
                });
            });
    }

    function readTableProperties(element) {
        return readTableStyle(element).map(function(style) {
            return {
                styleId: style.styleId,
                styleName: style.name
            };
        });
    }

    /**
     * Reads a w:tr element; the row is a header when w:trPr contains a
     * w:tblHeader element.
     */
    function readTableRow(element) {
        var properties = element.firstOrEmpty("w:trPr");
        var isHeader = !!properties.first("w:tblHeader");
        return readXmlElements(element.children).map(function(children) {
            return documents.TableRow(children, {isHeader: isHeader});
        });
    }

    /**
     * Reads a w:tc element. The column span comes from w:gridSpan (default 1).
     * The vertical-merge state is stored temporarily on `cell._vMerge` for
     * calculateRowSpans to consume and remove.
     */
    function readTableCell(element) {
        return readXmlElements(element.children).map(function(children) {
            var properties = element.firstOrEmpty("w:tcPr");

            var gridSpan = properties.firstOrEmpty("w:gridSpan").attributes["w:val"];
            var colSpan = gridSpan ? parseInt(gridSpan, 10) : 1;

            var cell = documents.TableCell(children, {colSpan: colSpan});
            cell._vMerge = readVMerge(properties);
            return cell;
        });
    }

    /**
     * Returns null when there is no w:vMerge element; otherwise true when its
     * w:val is "continue" or absent/empty, and false for any other value.
     */
    function readVMerge(properties) {
        var element = properties.first("w:vMerge");
        if (element) {
            var val = element.attributes["w:val"];
            return val === "continue" || !val;
        } else {
            return null;
        }
    }

    /**
     * Converts the temporary `_vMerge` markers on cells into `rowSpan`
     * increments on the cell that starts each merge, then removes the merged
     * continuation cells from their rows.
     *
     * If any child is not a table row, or any row child is not a table cell,
     * the rows are returned untouched together with a warning.
     */
    function calculateRowSpans(rows) {
        var unexpectedNonRows = _.any(rows, function(row) {
            return row.type !== documents.types.tableRow;
        });
        if (unexpectedNonRows) {
            return elementResultWithMessages(rows, [warning(
                "unexpected non-row element in table, cell merging may be incorrect"
            )]);
        }
        var unexpectedNonCells = _.any(rows, function(row) {
            return _.any(row.children, function(cell) {
                return cell.type !== documents.types.tableCell;
            });
        });
        if (unexpectedNonCells) {
            return elementResultWithMessages(rows, [warning(
                "unexpected non-cell element in table row, cell merging may be incorrect"
            )]);
        }

        // Maps a column index to the most recent cell starting at that column
        // which is not a merge continuation.
        var columns = {};

        rows.forEach(function(row) {
            var cellIndex = 0;
            row.children.forEach(function(cell) {
                if (cell._vMerge && columns[cellIndex]) {
                    columns[cellIndex].rowSpan++;
                } else {
                    columns[cellIndex] = cell;
                    cell._vMerge = false;
                }
                // Column indices advance by the cell's column span.
                cellIndex += cell.colSpan;
            });
        });

        rows.forEach(function(row) {
            row.children = row.children.filter(function(cell) {
                return !cell._vMerge;
            });
            row.children.forEach(function(cell) {
                delete cell._vMerge;
            });
        });

        return elementResult(rows);
    }

    /**
     * Reads every a:blip nested under
     * a:graphic / a:graphicData / pic:pic / pic:blipFill as an image.
     */
    function readDrawingElement(element) {
        var blips = element
            .getElementsByTagName("a:graphic")
            .getElementsByTagName("a:graphicData")
            .getElementsByTagName("pic:pic")
            .getElementsByTagName("pic:blipFill")
            .getElementsByTagName("a:blip");

        return combineResults(blips.map(readBlip.bind(null, element)));
    }

    /**
     * Reads one a:blip of a drawing element. The alt text is the wp:docPr
     * `descr` attribute, or `title` when `descr` is blank.
     */
    function readBlip(element, blip) {
        // firstOrEmpty so that a drawing without wp:docPr yields an image
        // with no alt text instead of throwing on a null element.
        var properties = element.firstOrEmpty("wp:docPr").attributes;
        var altText = isBlank(properties.descr) ? properties.title : properties.descr;
        var blipImageFile = findBlipImageFile(blip);
        if (blipImageFile === null) {
            return emptyResultWithMessages([warning("Could not find image file for a:blip element")]);
        } else {
            return readImage(blipImageFile, altText);
        }
    }

    function isBlank(value) {
        return value == null || /^\s*$/.test(value);
    }

    /**
     * Resolves the image file of a blip: an embedded image when r:embed is
     * set, otherwise a linked image read through `files` when r:link is set,
     * otherwise null.
     */
    function findBlipImageFile(blip) {
        var embedRelationshipId = blip.attributes["r:embed"];
        var linkRelationshipId = blip.attributes["r:link"];
        if (embedRelationshipId) {
            return findEmbeddedImageFile(embedRelationshipId);
        } else if (linkRelationshipId) {
            var imagePath = relationships.findTargetByRelationshipId(linkRelationshipId);
            return {
                path: imagePath,
                read: files.read.bind(files, imagePath)
            };
        } else {
            return null;
        }
    }

    /**
     * Reads a v:imagedata element, using o:title as the alt text. Elements
     * without r:id are ignored with a warning.
     */
    function readImageData(element) {
        var relationshipId = element.attributes['r:id'];

        if (relationshipId) {
            return readImage(
                findEmbeddedImageFile(relationshipId),
                element.attributes["o:title"]);
        } else {
            return emptyResultWithMessages([warning("A v:imagedata element without a relationship ID was ignored")]);
        }
    }

    function findEmbeddedImageFile(relationshipId) {
        var path = uris.uriToZipEntryName("word", relationships.findTargetByRelationshipId(relationshipId));
        return {
            path: path,
            read: docxFile.read.bind(docxFile, path)
        };
    }

    /**
     * Builds an Image element for `imageFile`, attaching a warning when its
     * content type is not listed in `supportedImageTypes`.
     */
    function readImage(imageFile, altText) {
        var contentType = contentTypes.findContentType(imageFile.path);

        var image = documents.Image({
            readImage: imageFile.read,
            altText: altText,
            contentType: contentType
        });
        // Messages are always passed as an array, matching every other
        // messages argument in this file.
        var warnings = supportedImageTypes[contentType] ?
            [] : [warning("Image of type " + contentType + " is unlikely to display in web browsers")];
        return elementResultWithMessages(image, warnings);
    }

    function undefinedStyleWarning(type, styleId) {
        return warning(
            type + " style with ID " + styleId + " was referenced but not defined in the document");
    }
}


/**
 * Finds the numbering level for a paragraph.
 *
 * A level associated with the paragraph style (when `styleId` is non-null and
 * `numbering.findLevelByParagraphStyleId` returns one) takes precedence.
 * Otherwise the level is looked up from the w:ilvl / w:numId children of
 * `element`; null is returned when either is missing.
 */
function readNumberingProperties(styleId, element, numbering) {
    if (styleId != null) {
        var levelByStyleId = numbering.findLevelByParagraphStyleId(styleId);
        if (levelByStyleId != null) {
            return levelByStyleId;
        }
    }

    var level = element.firstOrEmpty("w:ilvl").attributes["w:val"];
    var numId = element.firstOrEmpty("w:numId").attributes["w:val"];
    if (level === undefined || numId === undefined) {
        return null;
    } else {
        return numbering.findLevel(numId, level);
    }
}

// Content types for which readImage does not emit a warning.
var supportedImageTypes = {
    "image/png": true,
    "image/gif": true,
    "image/jpeg": true,
    "image/svg+xml": true,
    "image/tiff": true
};

// Elements that are skipped without an "unrecognised element" warning.
var ignoreElements = {
    "office-word:wrap": true,
    "v:shadow": true,
    "v:shapetype": true,
    "w:annotationRef": true,
    "w:bookmarkEnd": true,
    "w:sectPr": true,
    "w:proofErr": true,
    "w:lastRenderedPageBreak": true,
    "w:commentRangeStart": true,
    "w:commentRangeEnd": true,
    "w:del": true,
    "w:footnoteRef": true,
    "w:endnoteRef": true,
    "w:tblPr": true,
    "w:tblGrid": true,
    "w:trPr": true,
    "w:tcPr": true
};

function isParagraphProperties(element) {
    return element.type === "paragraphProperties";
}

function isRunProperties(element) {
    return element.type === "runProperties";
}

function negate(predicate) {
    return function(value) {
        return !predicate(value);
    };
}


function emptyResultWithMessages(messages) {
    return new ReadResult(null, null, messages);
}

function emptyResult() {
    return new ReadResult(null);
}

function elementResult(element) {
    return new ReadResult(element);
}

function elementResultWithMessages(element, messages) {
    return new ReadResult(element, null, messages);
}

/**
 * Result of reading XML: the element(s) read (`value`, defaulting to an empty
 * array), deferred `extra` elements, and `messages`.
 */
function ReadResult(element, extra, messages) {
    this.value = element || [];
    this.extra = extra;
    this._result = new Result({
        element: this.value,
        extra: extra
    }, messages);
    this.messages = this._result.messages;
}

/**
 * Returns a result with no value whose extra is this result's extra followed
 * by its value.
 */
ReadResult.prototype.toExtra = function() {
    return new ReadResult(null, joinElements(this.extra, this.value), this.messages);
};

/**
 * Returns a result whose value is this result's value followed by its extra,
 * or this result itself when there is no extra.
 */
ReadResult.prototype.insertExtra = function() {
    var extra = this.extra;
    if (extra && extra.length) {
        return new ReadResult(joinElements(this.value, extra), null, this.messages);
    } else {
        return this;
    }
};

/**
 * Transforms the value with `func`, keeping this result's extra.
 */
ReadResult.prototype.map = function(func) {
    var result = this._result.map(function(value) {
        return func(value.element);
    });
    return new ReadResult(result.value, this.extra, result.messages);
};

/**
 * Transforms the value with `func`, which must return a ReadResult; the extra
 * of both results is joined.
 */
ReadResult.prototype.flatMap = function(func) {
    var result = this._result.flatMap(function(value) {
        return func(value.element)._result;
    });
    return new ReadResult(result.value.element, joinElements(this.extra, result.value.extra), result.messages);
};

/**
 * Combines several ReadResults into one, flattening their values and their
 * (truthy) extras.
 */
function combineResults(results) {
    var result = Result.combine(_.pluck(results, "_result"));
    return new ReadResult(
        _.flatten(_.pluck(result.value, "element")),
        _.filter(_.flatten(_.pluck(result.value, "extra")), identity),
        result.messages
    );
}

function joinElements(first, second) {
    return _.flatten([first, second]);
}

function identity(value) {
    return value;
}