Index: xword/ContentFiltering/Office/Word/LocalToWebHTML.cs
===================================================================
--- xword/ContentFiltering/Office/Word/LocalToWebHTML.cs (revision 20809)
+++ xword/ContentFiltering/Office/Word/LocalToWebHTML.cs (working copy)
@@ -54,12 +54,194 @@
             AdaptLists(ref xmlDoc);
             AdaptMacros(ref xmlDoc);
             ClearOfficeAttributes(ref xmlDoc);
+            FixEmptyNestedElements(ref xmlDoc);
             StringBuilder sb = new StringBuilder(xmlDoc.GetIndentedXml());
             sb.Replace(" xmlns=\"\"","");
             return sb.ToString();
         }
 
         /// <summary>
+        /// Cleans up the XmlDocument by unwrapping needlessly nested elements and removing empty ones.
+        /// </summary>
+        /// <param name="xmlDoc">A reference to the XmlDocument.</param>
+        private void FixEmptyNestedElements(ref XmlDocument xmlDoc)
+        {
+            // Maps a parent element name to the child element names that may stay nested inside it.
+            Dictionary<string, List<string>> nestedNodesToProtect = new Dictionary<string, List<string>>();
+            nestedNodesToProtect.Add("b", new List<string>() { "u", "i", "span" });
+            nestedNodesToProtect.Add("u", new List<string>() { "b", "i", "span" });
+            nestedNodesToProtect.Add("i", new List<string>() { "b", "u", "span" });
+            nestedNodesToProtect.Add("span", new List<string>() { "b", "u", "i" });
+            nestedNodesToProtect.Add("p", new List<string>() { "span", "b", "u", "i" });
+            nestedNodesToProtect.Add("h1", new List<string>() { "span", "p", "div", "b", "u", "i" });
+            nestedNodesToProtect.Add("h2", new List<string>() { "span", "p", "div", "b", "u", "i" });
+            nestedNodesToProtect.Add("h3", new List<string>() { "span", "p", "div", "b", "u", "i" });
+            nestedNodesToProtect.Add("ol", new List<string>() { "li", "ol", "ul" });
+            nestedNodesToProtect.Add("ul", new List<string>() { "li", "ol", "ul" });
+            nestedNodesToProtect.Add("table", new List<string>() { "tr", "td", "thead", "tbody" });
+            nestedNodesToProtect.Add("tr", new List<string>() { "td" });
+            nestedNodesToProtect.Add("td", new List<string>() { "p", "span", "div", "b", "u", "i", "ol", "ul" });
+
+            // The indexer yields null when the document has no body element.
+            XmlNode body = xmlDoc.GetElementsByTagName("body")[0];
+            if (body != null)
+            {
+                // Iterate over a snapshot: unwrapping a node detaches it from the body,
+                // which would end an enumeration of the live ChildNodes list early.
+                foreach (XmlNode node in CopyChildNodes(body))
+                {
+                    RemoveNestedElements(ref xmlDoc, node, nestedNodesToProtect);
+                }
+            }
+
+            List<string> emptyNodesToProtect = new List<string>() { "br", "img", "hr" };
+            RemoveEmptyElements(ref xmlDoc, emptyNodesToProtect);
+        }
+
+        /// <summary>
+        /// Copies the current children of a node into a list, so that the tree can be
+        /// modified while the children are iterated.
+        /// </summary>
+        /// <param name="parent">Node whose children are copied.</param>
+        /// <returns>The children of the node, in document order.</returns>
+        private static List<XmlNode> CopyChildNodes(XmlNode parent)
+        {
+            List<XmlNode> children = new List<XmlNode>(parent.ChildNodes.Count);
+            foreach (XmlNode child in parent.ChildNodes)
+            {
+                children.Add(child);
+            }
+            return children;
+        }
+
+        /// <summary>
+        /// Tells whether a list of element names contains the given name.
+        /// List entries are trimmed and lower-cased before being compared.
+        /// </summary>
+        /// <param name="names">Element names to search.</param>
+        /// <param name="name">Element name to look for.</param>
+        /// <returns>True if the name is in the list.</returns>
+        private static bool ContainsName(List<string> names, string name)
+        {
+            foreach (string candidate in names)
+            {
+                if (candidate.Trim().ToLowerInvariant() == name)
+                {
+                    return true;
+                }
+            }
+            return false;
+        }
+
+        /// <summary>
+        /// Tells whether a node has to be kept because, according to the rules from the
+        /// nodesToProtect dictionary, at least one of its children may stay nested inside it.
+        /// </summary>
+        /// <param name="node">Node to check.</param>
+        /// <param name="nodesToProtect">Rules mapping a parent name to its protected child names.</param>
+        /// <returns>True if the node must not be unwrapped.</returns>
+        private static bool IsProtectedNesting(XmlNode node, Dictionary<string, List<string>> nodesToProtect)
+        {
+            foreach (KeyValuePair<string, List<string>> rule in nodesToProtect)
+            {
+                if (rule.Key.Trim().ToLowerInvariant() != node.Name)
+                {
+                    continue;
+                }
+                foreach (XmlNode child in node.ChildNodes)
+                {
+                    if (ContainsName(rule.Value, child.Name))
+                    {
+                        return true;
+                    }
+                }
+            }
+            return false;
+        }
+
+        /// <summary>
+        /// Unwraps needlessly nested elements in the specified node and its children: an element
+        /// whose first child is an element is replaced by its children, unless the rules from the
+        /// nodesToProtect dictionary allow one of those children inside it.
+        /// </summary>
+        /// <param name="xmlDoc">A reference to an XmlDocument.</param>
+        /// <param name="nodeToProcess">Node to process.</param>
+        /// <param name="nodesToProtect">Nodes not to alter.</param>
+        private void RemoveNestedElements(ref XmlDocument xmlDoc, XmlNode nodeToProcess, Dictionary<string, List<string>> nodesToProtect)
+        {
+            if (nodeToProcess.NodeType != XmlNodeType.Element || nodeToProcess.ChildNodes.Count == 0)
+            {
+                return;
+            }
+
+            // Children first, over a snapshot: a child that gets unwrapped is detached from this
+            // node, which would end an enumeration of the live ChildNodes list early.
+            foreach (XmlNode child in CopyChildNodes(nodeToProcess))
+            {
+                RemoveNestedElements(ref xmlDoc, child, nodesToProtect);
+            }
+
+            XmlNode parentNode = nodeToProcess.ParentNode;
+            XmlNode firstChild = nodeToProcess.FirstChild;
+            if (parentNode == null
+                || firstChild == null
+                || firstChild.NodeType != XmlNodeType.Element
+                || IsProtectedNesting(nodeToProcess, nodesToProtect))
+            {
+                return;
+            }
+
+            // Move the children right after the node, keeping their order, then drop the node.
+            XmlNode afterNode = nodeToProcess;
+            foreach (XmlNode nodeToMove in CopyChildNodes(nodeToProcess))
+            {
+                afterNode = parentNode.InsertAfter(nodeToProcess.RemoveChild(nodeToMove), afterNode);
+            }
+            parentNode.RemoveChild(nodeToProcess);
+        }
+
+        /// <summary>
+        /// Removes empty elements if not in nodesToProtect list.
+        /// An element is considered to be empty if it has no children and no innerText.
+        /// Empty paragraphs are transformed to breaks.
+        /// </summary>
+        /// <param name="xmlDoc">A reference to the XmlDocument.</param>
+        /// <param name="nodesToProtect">A list of strings containing nodes to protect from erasing (like br, img, hr).</param>
+        private void RemoveEmptyElements(ref XmlDocument xmlDoc, List<string> nodesToProtect)
+        {
+            List<XmlNode> nodesToDelete = new List<XmlNode>();
+            List<XmlNode> paragraphsToAlter = new List<XmlNode>();
+
+            // Collect first and modify afterwards, so that the document is not changed
+            // while the node list is being iterated.
+            foreach (XmlNode node in xmlDoc.GetElementsByTagName("*"))
+            {
+                if (node.NodeType != XmlNodeType.Element || node.ChildNodes.Count > 0)
+                {
+                    continue;
+                }
+
+                if (node.Name == "p")
+                {
+                    paragraphsToAlter.Add(node);
+                }
+                else if (!ContainsName(nodesToProtect, node.Name))
+                {
+                    nodesToDelete.Add(node);
+                }
+            }
+
+            foreach (XmlNode node in nodesToDelete)
+            {
+                node.ParentNode.RemoveChild(node);
+            }
+            foreach (XmlNode node in paragraphsToAlter)
+            {
+                node.ParentNode.ReplaceChild(xmlDoc.CreateElement("br"), node);
+            }
+        }
+
+        /// <summary>
         /// Removes 'class' attribute from text marked as containing grammar or spelling errors.
         /// (when values are 'gramE' or 'spellE'). Removes 'lang' attribute. Adds a space character
         /// (' ') to the affected text, to make sure words marked as errors are separated.
@@ -126,7 +308,9 @@
                 xIterator = navigator.Select(expression);
                 foreach (XPathNavigator nav in xIterator)
                 {
-                    if (nav.Value == "MsoNormal" || nav.Value == "MsoNormalTable" || nav.Value == "MsoTableGrid")
+                    if (nav.Value == "MsoNormal" || nav.Value == "MsoNormalTable"
+                        || nav.Value == "MsoTableGrid" || nav.Value == "MsoSubtitle"
+                        || nav.Value == "MsoTitle")
                     {
                         nav.DeleteSelf();
                     }