Index: xword/ContentFiltering/ContentFiltering.csproj =================================================================== --- xword/ContentFiltering/ContentFiltering.csproj (revision 21692) +++ xword/ContentFiltering/ContentFiltering.csproj (working copy) @@ -60,10 +60,24 @@ + + + + + + + + + + + + + + @@ -76,6 +90,18 @@ + + + + + + + + + + + + Index: xword/ContentFiltering/Html/HtmlUtil.cs =================================================================== --- xword/ContentFiltering/Html/HtmlUtil.cs (revision 21692) +++ xword/ContentFiltering/Html/HtmlUtil.cs (working copy) @@ -6,6 +6,7 @@ using System.Text.RegularExpressions; using TidyNet; using TidyNet.Dom; +using ContentFiltering.Office.Word.Cleaners; namespace XWiki.Html { @@ -21,7 +22,7 @@ /// The cleaned html code. public String HtmlToXhtml(String htmlSource) { - return CleanHTML(htmlSource, false); + return new TidyHTMLCleaner(false).Clean(htmlSource); } /// @@ -31,191 +32,11 @@ /// The cleaned html. public String WordHtmlToXhtml(String htmlSource) { - return CleanHTML(htmlSource, true); + return new TidyHTMLCleaner(true).Clean(htmlSource); } - /// - /// Uses Tidy.Net to clean a html source. - /// - /// The original html source. - /// Specifies if the source is an output from Microsoft Word - /// The cleaned Html. - public String CleanHTML(String htmlSource,bool isWordHtml) - { - Tidy tidy = new Tidy(); - //Options required dor xhtml conversion. - tidy.Options.DocType = DocType.Strict; - tidy.Options.DropFontTags = true; - tidy.Options.LogicalEmphasis = true; - tidy.Options.Xhtml = true; - tidy.Options.XmlOut = true; - tidy.Options.MakeClean = true; - tidy.Options.TidyMark = false; - tidy.Options.DropEmptyParas = true; - tidy.Options.IndentContent = true; - tidy.Options.SmartIndent = true; - tidy.Options.Word2000 = isWordHtml; - tidy.Options.EncloseBlockText = true; - - tidy.Options.XmlTags = true; - tidy.Options.FixComments = true; - TidyMessageCollection tmc = new TidyMessageCollection(); - MemoryStream input = new MemoryStream(); - MemoryStream output = new MemoryStream(); - - byte[] byteArray = Encoding.UTF8.GetBytes(htmlSource); - input.Write(byteArray, 0, byteArray.Length); - input.Position = 0; - try - { - tidy.Parse(input, output, tmc); - } - catch (FormatException ex) - { - Log.Exception(ex); - return htmlSource; - } - string cleanContent = Encoding.UTF8.GetString(output.ToArray()); - return cleanContent; - } /// - /// Gets a list with all the tags that contain attributes. - /// - /// The html source. - /// A of strings with the tags containing attributes. - public List GetTagsWithAttributes(String htmlSource) - { - List tags = new List(); - int startIndex = 0; - int endIndex = 0; - do - { - startIndex = htmlSource.IndexOf('<', endIndex); - if (startIndex >= 0) - { - endIndex = htmlSource.IndexOf('>', startIndex); - if (endIndex >= 0) - { - String tag = htmlSource.Substring(startIndex, endIndex - startIndex + 1); - if (tag.Contains('=')) - { - tags.Add(tag); - } - } - } - - } while (startIndex < (htmlSource.Length - 1) && endIndex < (htmlSource.Length - 1) && (startIndex >= 0) && (endIndex >= 0)); - return tags; - } - - /// - /// Corrects the img and br tags generated by Word. - /// - /// The html source to be corrected. - /// The name if the tag. Eg: "img", "br". - /// The corrected html coe. - public String CorrectTagsClosing(String htmlSource, String tagName) - { - //The string builder will be appendend when more then 1000 corrupted tags are found. - int slack = 1000; - string correctionString = " /"; - string searchedString = "<" + tagName; - StringBuilder sb = new StringBuilder(htmlSource.Length + slack); - sb.Insert(0, htmlSource); - int startIndex = 0; - int endIndex = 0; - int nonValidTags = 0; - do - { - startIndex = htmlSource.IndexOf(searchedString, endIndex); - if (startIndex >= 0) - { - endIndex = htmlSource.IndexOf('>', startIndex); - if (endIndex > 0) - { - //The tag is missing the '/' before the '>' character - if (!(htmlSource[endIndex - 1].CompareTo('/') == 0)) - { - sb.Insert(endIndex + nonValidTags * correctionString.Length, correctionString); - nonValidTags++; - } - } - } - } while (startIndex < (htmlSource.Length - 1) && endIndex < (htmlSource.Length - 1) && (startIndex >= 0) && (endIndex >= 0)); - return sb.ToString(); - } - - /// - /// Corrects the attributes that miss ' or ". - /// - /// The original html source code. - /// The source with corrected attributes. - public String CorrectAttributes(String htmlSource) - { - StringBuilder sb = new StringBuilder(htmlSource); - List tags = GetTagsWithAttributes(htmlSource); - foreach(String initialValue in tags) - { - String value = initialValue; - char[] separators = {' ','>','/','\r'}; - bool hasChanged = false; - foreach (String s in initialValue.Split(separators)) - { - String[] attribute = s.Split('='); - if(attribute.Length == 2) - { - try - { - String newValue = attribute[1]; - if (attribute[1][0] != '\'' && attribute[1][0] != '\"') - { - newValue = attribute[0] + "=\"" + attribute[1] + "\""; - value = value.Replace(s, newValue); - hasChanged = true; - } - } - catch (IndexOutOfRangeException) { }; - } - } - if (hasChanged) - { - sb = sb.Replace(initialValue, value); - } - } - return sb.ToString(); - } - - /// - /// Removes the tags that are in the office namespaces. - /// - /// The original content. - /// The cleaned content. - public String RemoveOfficeNameSpacesTags(String content) - { - bool foundTags = false; - int startIndex = 0; - int endIndex = 0; - do - { - foundTags = false; - startIndex = content.IndexOf("= 0) - { - endIndex = content.IndexOf("= 0) - { - endIndex = content.IndexOf(">",endIndex + 1); - content = content.Remove(startIndex, endIndex - startIndex + 1); - } - foundTags = true; - startIndex = endIndex - (endIndex - startIndex + 1); - } - } while (foundTags); - return content; - } - - /// /// Removes a char sequence that starts and ends with the given valaues. /// /// The initial content. @@ -245,25 +66,8 @@ return content; } + /// - /// Gets the content between the opening and closing html tags. - /// - /// The html source to be - /// the inner html of the body. - public String GetBodyContent(String htmlCode) - { - //Delete header & footer - int startIndex, endIndex; - startIndex = htmlCode.IndexOf("", startIndex); - htmlCode = htmlCode.Remove(0, endIndex + 1); - startIndex = htmlCode.IndexOf("= 0) - htmlCode = htmlCode.Remove(startIndex); - return htmlCode; - } - - /// /// Indents the given html source. /// /// The html source. @@ -285,67 +89,8 @@ return htmlSource; } - /// - /// Removes the doctype declaration from an given html code. - /// - /// The original html code. - /// The modified html code. - public String RemoveDoctype(String htmlCode) - { - int startIndex, endIndex; - startIndex = htmlCode.IndexOf("", startIndex); - return htmlCode.Remove(startIndex, endIndex - startIndex); - } /// - /// Gets a string representing the opening html tag with the XML namespace definitions, if any. - /// - /// The html source to be processed - /// a string representing the opening html tag. - public String GetXmlNamespaceDefinitions(String htmlCode) - { - int startIndex, endIndex; - startIndex = htmlCode.IndexOf("", startIndex); - return htmlCode.Substring(startIndex, endIndex - startIndex + 1); - } - } - - /// - /// Replaces the opening html tag with a given one. - /// - /// The html source. - /// The new html tag. - /// - public String ReplaceXmlNamespaceDefinitions(String htmlCode, String newHtmlTag) - { - String oldHtmlTag = GetXmlNamespaceDefinitions(htmlCode); - if (oldHtmlTag == null) - { - if (!htmlCode.Contains(""); - htmlCode = htmlCode.Insert(htmlCode.Length, ""); - } - htmlCode = htmlCode.Insert(0, newHtmlTag); - htmlCode = htmlCode.Insert(htmlCode.Length, ""); - } - else - { - htmlCode = htmlCode.Replace(oldHtmlTag, newHtmlTag); - } - return htmlCode; - } - - /// /// Replaces the body tag with a new given one. /// /// The initial html code. Index: xword/ContentFiltering/Office/Word/Cleaners/BodyContentExtractor.cs =================================================================== --- xword/ContentFiltering/Office/Word/Cleaners/BodyContentExtractor.cs (revision 0) +++ xword/ContentFiltering/Office/Word/Cleaners/BodyContentExtractor.cs (revision 0) @@ -0,0 +1,36 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; + +namespace ContentFiltering.Office.Word.Cleaners +{ + /// + /// Gets the inner html of the body. + /// + public class BodyContentExtractor : IHTMLCleaner + { + + #region IHTMLCleaner Members + + /// + /// Gets the content between the opening and closing html tags. + /// + /// The html source to be + /// The inner html of the body. + public string Clean(string htmlCode) + { + //Delete header & footer + int startIndex, endIndex; + startIndex = htmlCode.IndexOf("", startIndex); + htmlCode = htmlCode.Remove(0, endIndex + 1); + startIndex = htmlCode.IndexOf("= 0) + htmlCode = htmlCode.Remove(startIndex); + return htmlCode; + } + + #endregion IHTMLCleaner Members + } +} Index: xword/ContentFiltering/Office/Word/Cleaners/CommentsRemover.cs =================================================================== --- xword/ContentFiltering/Office/Word/Cleaners/CommentsRemover.cs (revision 0) +++ xword/ContentFiltering/Office/Word/Cleaners/CommentsRemover.cs (revision 0) @@ -0,0 +1,35 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using XWiki.Html; + +namespace ContentFiltering.Office.Word.Cleaners +{ + /// + /// Removes comments ('<!-- ... >' and '<![ ... ]>') from an html source. + /// + public class CommentsRemover : IHTMLCleaner + { + private HtmlUtil htmlUtil; + + public CommentsRemover() + { + htmlUtil = new HtmlUtil(); + } + #region IHTMLCleaner Members + + /// + /// Removes comments ('<!-- ... >' and '<![ ... ]>') from an html source. + /// + /// The HTML source to clean. + /// The cleaned HTML source (without comments) + public string Clean(string htmlSource) + { + string cleanHTML = htmlUtil.RemoveSpecificTagContent(htmlSource, ""); + return htmlUtil.RemoveSpecificTagContent(cleanHTML, ""); + } + + #endregion IHTMLCleaner Members + } +} Index: xword/ContentFiltering/Office/Word/Cleaners/CorrectAttributesCleaner.cs =================================================================== --- xword/ContentFiltering/Office/Word/Cleaners/CorrectAttributesCleaner.cs (revision 0) +++ xword/ContentFiltering/Office/Word/Cleaners/CorrectAttributesCleaner.cs (revision 0) @@ -0,0 +1,87 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; + +namespace ContentFiltering.Office.Word.Cleaners +{ + /// + /// Corrects the attributes that miss ' or ". + /// + public class CorrectAttributesCleaner : IHTMLCleaner + { + #region IHTMLCleaner Members + + /// + /// Corrects the attributes that miss ' or ". + /// + /// The original html source code. + /// The source with corrected attributes. + public string Clean(string htmlSource) + { + StringBuilder sb = new StringBuilder(htmlSource); + List tags = GetTagsWithAttributes(htmlSource); + foreach (String initialValue in tags) + { + String value = initialValue; + char[] separators = { ' ', '>', '/', '\r' }; + bool hasChanged = false; + foreach (String s in initialValue.Split(separators)) + { + String[] attribute = s.Split('='); + if (attribute.Length == 2) + { + try + { + String newValue = attribute[1]; + if (attribute[1][0] != '\'' && attribute[1][0] != '\"') + { + newValue = attribute[0] + "=\"" + attribute[1] + "\""; + value = value.Replace(s, newValue); + hasChanged = true; + } + } + catch (IndexOutOfRangeException) { }; + } + } + if (hasChanged) + { + sb = sb.Replace(initialValue, value); + } + } + return sb.ToString(); + } + + #endregion IHTMLCleaner Members + + /// + /// Gets a list with all the tags that contain attributes. + /// + /// The html source. + /// A list of strings with the tags containing attributes. + public List GetTagsWithAttributes(String htmlSource) + { + List tags = new List(); + int startIndex = 0; + int endIndex = 0; + do + { + startIndex = htmlSource.IndexOf('<', endIndex); + if (startIndex >= 0) + { + endIndex = htmlSource.IndexOf('>', startIndex); + if (endIndex >= 0) + { + String tag = htmlSource.Substring(startIndex, endIndex - startIndex + 1); + if (tag.Contains('=')) + { + tags.Add(tag); + } + } + } + + } while (startIndex < (htmlSource.Length - 1) && endIndex < (htmlSource.Length - 1) && (startIndex >= 0) && (endIndex >= 0)); + return tags; + } + } +} Index: xword/ContentFiltering/Office/Word/Cleaners/CorrectTagsClosingCleaner.cs =================================================================== --- xword/ContentFiltering/Office/Word/Cleaners/CorrectTagsClosingCleaner.cs (revision 0) +++ xword/ContentFiltering/Office/Word/Cleaners/CorrectTagsClosingCleaner.cs (revision 0) @@ -0,0 +1,64 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; + +namespace ContentFiltering.Office.Word.Cleaners +{ + /// + /// Corrects the <img> and <br> tags generated by Word. + /// + class CorrectTagsClosingCleaner : IHTMLCleaner + { + private string tagName; + + /// + /// Cleaner constructor. + /// + /// The name if the tag. Eg: "img", "br". + public CorrectTagsClosingCleaner(string tagName) + { + this.tagName = tagName; + } + + #region IHTMLCleaner Members + + /// + /// Corrects the img and br tags generated by Word. + /// + /// The html source to be corrected. + /// The corrected html coe. + public string Clean(string htmlSource) + { + //The string builder will be appendend when more then 1000 corrupted tags are found. + int slack = 1000; + string correctionString = " /"; + string searchedString = "<" + tagName; + StringBuilder sb = new StringBuilder(htmlSource.Length + slack); + sb.Insert(0, htmlSource); + int startIndex = 0; + int endIndex = 0; + int nonValidTags = 0; + do + { + startIndex = htmlSource.IndexOf(searchedString, endIndex); + if (startIndex >= 0) + { + endIndex = htmlSource.IndexOf('>', startIndex); + if (endIndex > 0) + { + //The tag is missing the '/' before the '>' character + if (!(htmlSource[endIndex - 1].CompareTo('/') == 0)) + { + sb.Insert(endIndex + nonValidTags * correctionString.Length, correctionString); + nonValidTags++; + } + } + } + } while (startIndex < (htmlSource.Length - 1) && endIndex < (htmlSource.Length - 1) && (startIndex >= 0) && (endIndex >= 0)); + return sb.ToString(); + } + + #endregion IHTMLCleaner Members + } +} Index: xword/ContentFiltering/Office/Word/Cleaners/DoctypeRemover.cs =================================================================== --- xword/ContentFiltering/Office/Word/Cleaners/DoctypeRemover.cs (revision 0) +++ xword/ContentFiltering/Office/Word/Cleaners/DoctypeRemover.cs (revision 0) @@ -0,0 +1,31 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; + +namespace ContentFiltering.Office.Word.Cleaners +{ + /// + /// Removes the doctype declaration from a given html code. + /// + public class DoctypeRemover : IHTMLCleaner + { + #region IHTMLCleaner Members + + /// + /// Removes the doctype declaration from a given html code. + /// + /// The original html code. + /// The modified html code. + public string Clean(string htmlCode) + { + int startIndex, endIndex; + startIndex = htmlCode.IndexOf("", startIndex); + return htmlCode.Remove(startIndex, endIndex - startIndex); + } + + #endregion IHTMLCleaner Members + + } +} Index: xword/ContentFiltering/Office/Word/Cleaners/EmptyParagraphsCleaner.cs =================================================================== --- xword/ContentFiltering/Office/Word/Cleaners/EmptyParagraphsCleaner.cs (revision 0) +++ xword/ContentFiltering/Office/Word/Cleaners/EmptyParagraphsCleaner.cs (revision 0) @@ -0,0 +1,29 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; + +namespace ContentFiltering.Office.Word.Cleaners +{ + /// + /// Replaces empty paragraphs with line breaks ('<br/>'). + /// + public class EmptyParagraphsCleaner : IHTMLCleaner + { + #region IHTMLCleaner Members + + /// + /// Replaces empty paragraphs with line breaks ('<br/>'). + /// + /// Initial HTML source. + /// Cleaned HTML source (empty paragraphs replaced with line breaks). + public string Clean(string htmlSource) + { + htmlSource = htmlSource.Replace("", "
"); + htmlSource = htmlSource.Replace("

 

", "
"); + return htmlSource; + } + + #endregion IHTMLCleaner Members + } +} Index: xword/ContentFiltering/Office/Word/Cleaners/HeadSectionRemover.cs =================================================================== --- xword/ContentFiltering/Office/Word/Cleaners/HeadSectionRemover.cs (revision 0) +++ xword/ContentFiltering/Office/Word/Cleaners/HeadSectionRemover.cs (revision 0) @@ -0,0 +1,37 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using XWiki.Html; + +namespace ContentFiltering.Office.Word.Cleaners +{ + /// + /// Removes the head section from an html source. + /// + public class HeadSectionRemover : IHTMLCleaner + { + private HtmlUtil htmlUtil; + + /// + /// Default constructor. + /// + public HeadSectionRemover() + { + htmlUtil = new HtmlUtil(); + } + #region IHTMLCleaner Members + + /// + /// Removes the head section from an html source. + /// + /// The HTML source. + /// The HTML source without the head section. + public string Clean(string htmlSource) + { + return htmlUtil.RemoveSpecificTagContent(htmlSource, "", ""); + } + + #endregion IHTMLCleaner Members + } +} Index: xword/ContentFiltering/Office/Word/Cleaners/IHTMLCleaner.cs =================================================================== --- xword/ContentFiltering/Office/Word/Cleaners/IHTMLCleaner.cs (revision 0) +++ xword/ContentFiltering/Office/Word/Cleaners/IHTMLCleaner.cs (revision 0) @@ -0,0 +1,15 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; + +namespace ContentFiltering.Office.Word.Cleaners +{ + /// + /// Interface for HTML cleaners (pre-DOM filters). + /// + public interface IHTMLCleaner + { + string Clean(string htmlSource); + } +} Index: xword/ContentFiltering/Office/Word/Cleaners/ListCharsCleaner.cs =================================================================== --- xword/ContentFiltering/Office/Word/Cleaners/ListCharsCleaner.cs (revision 0) +++ xword/ContentFiltering/Office/Word/Cleaners/ListCharsCleaner.cs (revision 0) @@ -0,0 +1,30 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; + +namespace ContentFiltering.Office.Word.Cleaners +{ + /// + /// Replaces some characters used by MS Word for bullet lists with 'o' characters. + /// + public class ListCharsCleaner : IHTMLCleaner + { + #region IHTMLCleaner Members + + /// + /// Replaces some characters used by MS Word for bullet lists (like &middot;) + /// with 'o' characters. + /// + /// Initial HTML source. + /// Cleaned HTML source. + public string Clean(string htmlSource) + { + htmlSource = htmlSource.Replace('·', 'o'); + htmlSource = htmlSource.Replace('§', 'o'); + return htmlSource; + } + + #endregion IHTMLCleaner Members + } +} Index: xword/ContentFiltering/Office/Word/Cleaners/NbspBetweenTagsRemover.cs =================================================================== --- xword/ContentFiltering/Office/Word/Cleaners/NbspBetweenTagsRemover.cs (revision 0) +++ xword/ContentFiltering/Office/Word/Cleaners/NbspBetweenTagsRemover.cs (revision 0) @@ -0,0 +1,27 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; + +namespace ContentFiltering.Office.Word.Cleaners +{ + /// + /// Removes the &nbsp; between tags. + /// + public class NbspBetweenTagsRemover:IHTMLCleaner + { + #region IHTMLCleaner Members + + /// + /// Removes the &nbsp; between tags. + /// + /// Initial HTML source. + /// Cleaned HTML. + public string Clean(string htmlSource) + { + return htmlSource.Replace("> <", "><"); + } + + #endregion IHTMLCleaner Members + } +} Index: xword/ContentFiltering/Office/Word/Cleaners/NbspReplacer.cs =================================================================== --- xword/ContentFiltering/Office/Word/Cleaners/NbspReplacer.cs (revision 0) +++ xword/ContentFiltering/Office/Word/Cleaners/NbspReplacer.cs (revision 0) @@ -0,0 +1,27 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; + +namespace ContentFiltering.Office.Word.Cleaners +{ + /// + /// Removes the &nbsp; between tags. + /// + public class NbspReplacer : IHTMLCleaner + { + #region IHTMLCleaner Members + + /// + /// Removes the &nbsp; between tags. + /// + /// Initial HTML source. + /// Cleaned HTML. + public string Clean(string htmlSource) + { + return htmlSource.Replace(" ", " "); + } + + #endregion IHTMLCleaner Members + } +} Index: xword/ContentFiltering/Office/Word/Cleaners/OfficeNameSpacesTagsRemover.cs =================================================================== --- xword/ContentFiltering/Office/Word/Cleaners/OfficeNameSpacesTagsRemover.cs (revision 0) +++ xword/ContentFiltering/Office/Word/Cleaners/OfficeNameSpacesTagsRemover.cs (revision 0) @@ -0,0 +1,46 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; + +namespace ContentFiltering.Office.Word.Cleaners +{ + /// + /// Removes the tags that are in the office namespaces. + /// + public class OfficeNameSpacesTagsRemover : IHTMLCleaner + { + #region IHTMLCleaner Members + + /// + /// Removes the tags that are in the office namespaces. + /// + /// The original content. + /// The cleaned content. + public string Clean(string htmlSource) + { + bool foundTags = false; + int startIndex = 0; + int endIndex = 0; + do + { + foundTags = false; + startIndex = htmlSource.IndexOf("= 0) + { + endIndex = htmlSource.IndexOf("= 0) + { + endIndex = htmlSource.IndexOf(">", endIndex + 1); + htmlSource = htmlSource.Remove(startIndex, endIndex - startIndex + 1); + } + foundTags = true; + startIndex = endIndex - (endIndex - startIndex + 1); + } + } while (foundTags); + return htmlSource; + } + + #endregion IHTMLCleaner Members + } +} Index: xword/ContentFiltering/Office/Word/Cleaners/TidyHTMLCleaner.cs =================================================================== --- xword/ContentFiltering/Office/Word/Cleaners/TidyHTMLCleaner.cs (revision 0) +++ xword/ContentFiltering/Office/Word/Cleaners/TidyHTMLCleaner.cs (revision 0) @@ -0,0 +1,70 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using TidyNet; +using System.IO; +using XWiki; + +namespace ContentFiltering.Office.Word.Cleaners +{ + /// + /// Uses Tidy.Net to clean a html source. + /// + public class TidyHTMLCleaner : IHTMLCleaner + { + private bool isWordHtml; + + public TidyHTMLCleaner(bool isWordHtml) + { + this.isWordHtml = isWordHtml; + } + #region IHTMLCleaner Members + /// + /// Uses Tidy.Net to clean a html source. + /// + /// The original html source. + /// Specifies if the source is an output from Microsoft Word + /// The cleaned Html. + public string Clean(string htmlSource) + { + Tidy tidy = new Tidy(); + //Options required dor xhtml conversion. + tidy.Options.DocType = DocType.Strict; + tidy.Options.DropFontTags = true; + tidy.Options.LogicalEmphasis = true; + tidy.Options.Xhtml = true; + tidy.Options.XmlOut = true; + tidy.Options.MakeClean = true; + tidy.Options.TidyMark = false; + tidy.Options.DropEmptyParas = true; + tidy.Options.IndentContent = true; + tidy.Options.SmartIndent = true; + tidy.Options.Word2000 = isWordHtml; + tidy.Options.EncloseBlockText = true; + + tidy.Options.XmlTags = true; + tidy.Options.FixComments = true; + TidyMessageCollection tmc = new TidyMessageCollection(); + MemoryStream input = new MemoryStream(); + MemoryStream output = new MemoryStream(); + + byte[] byteArray = Encoding.UTF8.GetBytes(htmlSource); + input.Write(byteArray, 0, byteArray.Length); + input.Position = 0; + try + { + tidy.Parse(input, output, tmc); + } + catch (FormatException ex) + { + Log.Exception(ex); + return htmlSource; + } + string cleanContent = Encoding.UTF8.GetString(output.ToArray()); + return cleanContent; + } + + #endregion IHTMLCleaner Members + } +} Index: xword/ContentFiltering/Office/Word/Cleaners/XmlNamespaceDefinitionsReplacer.cs =================================================================== --- xword/ContentFiltering/Office/Word/Cleaners/XmlNamespaceDefinitionsReplacer.cs (revision 0) +++ xword/ContentFiltering/Office/Word/Cleaners/XmlNamespaceDefinitionsReplacer.cs (revision 0) @@ -0,0 +1,75 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; + +namespace ContentFiltering.Office.Word.Cleaners +{ + /// + /// Replaces the opening html tag with a given one. + /// + public class XmlNamespaceDefinitionsReplacer : IHTMLCleaner + { + private string newHtmlTag; + + /// + /// Creates a new XmlNamespaceDefinitionsReplacer HTML cleaner (pre-DOM filter). + /// + /// Openening html tag that will replace the current one. + public XmlNamespaceDefinitionsReplacer(string newHtmlTag) + { + this.newHtmlTag = newHtmlTag; + } + + #region IHTMLCleaner Members + /// + /// Replaces the opening html tag with a given one. + /// + /// The html source. + /// The new html tag. + /// Cleaned HTML source. + public string Clean(string htmlCode) + { + String oldHtmlTag = GetXmlNamespaceDefinitions(htmlCode); + if (oldHtmlTag == null) + { + if (!htmlCode.Contains(""); + htmlCode = htmlCode.Insert(htmlCode.Length, ""); + } + htmlCode = htmlCode.Insert(0, newHtmlTag); + htmlCode = htmlCode.Insert(htmlCode.Length, ""); + } + else + { + htmlCode = htmlCode.Replace(oldHtmlTag, newHtmlTag); + } + return htmlCode; + } + + #endregion IHTMLCleaner Members + + /// + /// Gets a string representing the opening html tag with the XML namespace definitions, if any. + /// + /// The html source to be processed + /// a string representing the opening html tag. + public String GetXmlNamespaceDefinitions(String htmlCode) + { + int startIndex, endIndex; + startIndex = htmlCode.IndexOf("", startIndex); + return htmlCode.Substring(startIndex, endIndex - startIndex + 1); + } + } + + } +} Index: xword/ContentFiltering/Office/Word/LocalToWebHTML.cs =================================================================== --- xword/ContentFiltering/Office/Word/LocalToWebHTML.cs (revision 21692) +++ xword/ContentFiltering/Office/Word/LocalToWebHTML.cs (working copy) @@ -9,6 +9,7 @@ using System.Collections; using ContentFiltering.Office.Word; using ContentFiltering.Office.Word.Filters; +using ContentFiltering.Office.Word.Cleaners; namespace XWiki.Office.Word { @@ -29,26 +30,24 @@ { XmlDocument xmlDoc = new XmlDocument(); //xmlDoc.XmlResolver = null; - String uncleanedContent = htmlUtil.CorrectAttributes(content); - uncleanedContent = htmlUtil.CorrectTagsClosing(uncleanedContent, "img"); - uncleanedContent = htmlUtil.CorrectTagsClosing(uncleanedContent, "br"); - content = htmlUtil.CleanHTML(uncleanedContent, true); + + String uncleanedContent = new CorrectAttributesCleaner().Clean(content); + uncleanedContent = new CorrectTagsClosingCleaner("img").Clean(uncleanedContent); + uncleanedContent = new CorrectTagsClosingCleaner("br").Clean(uncleanedContent); + content = new TidyHTMLCleaner(true).Clean(uncleanedContent); + if (content.Length == 0) { content = uncleanedContent; } - //content = htmlUtil.RemoveOfficeNameSpacesTags(content); - //content = htmlUtil.ReplaceBody(content, ""); - content = htmlUtil.ReplaceXmlNamespaceDefinitions(content, HTML_OPENING_TAG); - content = content.Replace('·','o'); - content = content.Replace('§', 'o');//"·"; "o"; "§"; - //Removing   from Word and Tidy output - content = content.Replace("", "
"); - content = content.Replace("

 

", "
"); - content = content.Replace("> <", "><"); - content = content.Replace("", ""); - content = content.Replace("", ""); - content = content.Replace(" ", " "); + + content = new XmlNamespaceDefinitionsReplacer(HTML_OPENING_TAG).Clean(content); + content = new ListCharsCleaner().Clean(content); + content = new EmptyParagraphsCleaner().Clean(content); + content = new NbspBetweenTagsRemover().Clean(content); + content = new OfficeNameSpacesTagsRemover().Clean(content); + content = new NbspReplacer().Clean(content); + xmlDoc.LoadXml(content); List contentFilters = new List() Index: xword/ContentFiltering/Office/Word/WebToLocalHTML.cs =================================================================== --- xword/ContentFiltering/Office/Word/WebToLocalHTML.cs (revision 21692) +++ xword/ContentFiltering/Office/Word/WebToLocalHTML.cs (working copy) @@ -12,6 +12,7 @@ using XWiki.Xml; using System.Collections; using ContentFiltering.Office.Word.Filters; +using ContentFiltering.Office.Word.Cleaners; namespace XWiki.Office.Word { @@ -92,14 +93,13 @@ public String AdaptSource(String content) { XmlDocument xmlDoc = new XmlDocument(); - content = htmlUtil.RemoveOfficeNameSpacesTags(content); - //String namespaces = htmlUtil.GetXmlNamespaceDefinitions(content); - content = htmlUtil.CleanHTML(content, false); - content = htmlUtil.ReplaceXmlNamespaceDefinitions(content, HTML_OPENING_TAG); - content = content.Replace("", "
"); - content = content.Replace("

 

", "
"); - content = content.Replace("> <", "><"); - content = content.Replace(" ", " "); + content = new OfficeNameSpacesTagsRemover().Clean(content); + + content = new TidyHTMLCleaner(false).Clean(content); + content = new XmlNamespaceDefinitionsReplacer(HTML_OPENING_TAG).Clean(content); + content = new EmptyParagraphsCleaner().Clean(content); + content = new NbspBetweenTagsRemover().Clean(content); + content = new NbspReplacer().Clean(content); //content = content.Insert(0, DOCTYPE); try { @@ -111,6 +111,7 @@ return "Sorry, a problem appeared when loading the page"; } + List webToLocalFilters = new List() { new WebMacrosAdaptorFilter(manager), Index: xword/ContentFiltering/Test/Office/Word/Cleaners/BodyContentExtractorTest.cs =================================================================== --- xword/ContentFiltering/Test/Office/Word/Cleaners/BodyContentExtractorTest.cs (revision 0) +++ xword/ContentFiltering/Test/Office/Word/Cleaners/BodyContentExtractorTest.cs (revision 0) @@ -0,0 +1,43 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using NUnit.Framework; +using ContentFiltering.Office.Word.Cleaners; + +namespace ContentFiltering.Test.Office.Word.Cleaners +{ + /// + /// Test class for BodyContentExtractor. + /// + [TestFixture] + public class BodyContentExtractorTest + { + private string initialHTML; + private string expectedHTML; + + /// + /// Default constructor. + /// + public BodyContentExtractorTest() + { + initialHTML = ""; + expectedHTML = ""; + } + + [TestFixtureSetUp] + public void TestSetup() + { + initialHTML = "

Header 1

Body Content goes here

"; + expectedHTML = "

Header 1

Body Content goes here

"; + } + + [Test] + public void TestCleaner() + { + IHTMLCleaner bodyContentExctractor = new BodyContentExtractor(); + initialHTML = bodyContentExctractor.Clean(initialHTML); + Assert.AreEqual(initialHTML, expectedHTML); + } + } +} Index: xword/ContentFiltering/Test/Office/Word/Cleaners/CommentsRemoverTest.cs =================================================================== --- xword/ContentFiltering/Test/Office/Word/Cleaners/CommentsRemoverTest.cs (revision 0) +++ xword/ContentFiltering/Test/Office/Word/Cleaners/CommentsRemoverTest.cs (revision 0) @@ -0,0 +1,68 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using NUnit.Framework; +using ContentFiltering.Office.Word.Cleaners; + +namespace ContentFiltering.Test.Office.Word.Cleaners +{ + /// + /// Test class for CommentsRemover. + /// + [TestFixture] + public class CommentsRemoverTest + { + private string initialHTML; + private string expectedHTML; + + public CommentsRemoverTest() + { + initialHTML = ""; + expectedHTML = ""; + } + + [TestFixtureSetUp] + public void TestSetup() + { + initialHTML = " " + + "" + + "" + + "" + + "" + + "" + + "

the paragraph

" + + "" + + ""; + + expectedHTML = " " + + "" + + "" + + "" + + "

the paragraph

" + + "" + + ""; + } + + [Test] + public void TestCleaner() + { + initialHTML = new CommentsRemover().Clean(initialHTML); + Assert.AreEqual(initialHTML, expectedHTML); + } + } +} Index: xword/ContentFiltering/Test/Office/Word/Cleaners/CorrectAttributesCleanerTest.cs =================================================================== --- xword/ContentFiltering/Test/Office/Word/Cleaners/CorrectAttributesCleanerTest.cs (revision 0) +++ xword/ContentFiltering/Test/Office/Word/Cleaners/CorrectAttributesCleanerTest.cs (revision 0) @@ -0,0 +1,51 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using NUnit.Framework; +using ContentFiltering.Office.Word.Cleaners; + +namespace ContentFiltering.Test.Office.Word.Cleaners +{ + /// + /// Test for CorrectAttributesCleaner pre-DOM filter. + /// + [TestFixture] + public class CorrectAttributesCleanerTest + { + private string initialHTML; + private string expectedHTML; + + /// + /// Default constructor. + /// + public CorrectAttributesCleanerTest() + { + initialHTML = ""; + expectedHTML = ""; + } + + [TestFixtureSetUp] + public void GlobalSetup() + { + initialHTML = "Title" + + "

text

" + + "

text

" + + "" + + "red text" + + ""; + expectedHTML = "Title" + + "

text

" + + "

text

" + + "

copyright notes

" + + "red text" + + ""; + } + [Test] + public void TestCleaner() + { + initialHTML = new CorrectAttributesCleaner().Clean(initialHTML); + Assert.AreEqual(initialHTML, expectedHTML); + } + } +} Index: xword/ContentFiltering/Test/Office/Word/Cleaners/CorrectTagsClosingCleanerTest.cs =================================================================== --- xword/ContentFiltering/Test/Office/Word/Cleaners/CorrectTagsClosingCleanerTest.cs (revision 0) +++ xword/ContentFiltering/Test/Office/Word/Cleaners/CorrectTagsClosingCleanerTest.cs (revision 0) @@ -0,0 +1,55 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using NUnit.Framework; +using ContentFiltering.Office.Word.Cleaners; + +namespace ContentFiltering.Test.Office.Word.Cleaners +{ + /// + /// Test class for CorrectTagsClosingCleaner pre-DOM filter. + /// + [TestFixture] + public class CorrectTagsClosingCleanerTest + { + private string initialHTML1; + private string initialHTML2; + private string expectedHTML1; + private string expectedHTML2; + + /// + /// Default constructor. + /// + public CorrectTagsClosingCleanerTest() + { + initialHTML1 = ""; + initialHTML2 = ""; + expectedHTML1 = ""; + expectedHTML2 = ""; + } + + [TestFixtureSetUp] + public void TestSetup() + { + initialHTML1 = ""; + initialHTML2 = "

Text
Text

"; + + expectedHTML1 = ""; + expectedHTML2 = "

Text
Text

"; + } + + [Test] + public void TestCleaner() + { + IHTMLCleaner tagClosingCleaner1 = new CorrectTagsClosingCleaner("img"); + initialHTML1 = tagClosingCleaner1.Clean(initialHTML1); + + IHTMLCleaner tagClosingCleaner2 = new CorrectTagsClosingCleaner("br"); + initialHTML2 = tagClosingCleaner2.Clean(initialHTML2); + + Assert.AreEqual(initialHTML1, expectedHTML1); + Assert.AreEqual(initialHTML2, expectedHTML2); + } + } +} Index: xword/ContentFiltering/Test/Office/Word/Cleaners/DoctypeRemoverTest.cs =================================================================== --- xword/ContentFiltering/Test/Office/Word/Cleaners/DoctypeRemoverTest.cs (revision 0) +++ xword/ContentFiltering/Test/Office/Word/Cleaners/DoctypeRemoverTest.cs (revision 0) @@ -0,0 +1,49 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using NUnit.Framework; +using ContentFiltering.Office.Word.Cleaners; + +namespace ContentFiltering.Test.Office.Word.Cleaners +{ + /// + /// Test class for DoctypeRemover. + /// + [TestFixture] + public class DoctypeRemoverTest + { + private string initialHTML; + private string expectedHTML; + + /// + /// Default constructor. + /// + public DoctypeRemoverTest() + { + initialHTML = ""; + expectedHTML = ""; + } + + [TestFixtureSetUp] + public void TestSetup() + { + initialHTML = " " + + " " + + "Content" + + ""; + + expectedHTML = " " + + "Content" + + ""; + } + + [Test] + public void TestCleaner() + { + IHTMLCleaner doctypeRemover = new DoctypeRemover(); + initialHTML = doctypeRemover.Clean(initialHTML); + Assert.AreEqual(initialHTML, expectedHTML); + } + } +} Index: xword/ContentFiltering/Test/Office/Word/Cleaners/EmptyParagraphsCleanerTest.cs =================================================================== --- xword/ContentFiltering/Test/Office/Word/Cleaners/EmptyParagraphsCleanerTest.cs (revision 0) +++ xword/ContentFiltering/Test/Office/Word/Cleaners/EmptyParagraphsCleanerTest.cs (revision 0) @@ -0,0 +1,53 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using NUnit.Framework; +using ContentFiltering.Office.Word.Cleaners; + +namespace ContentFiltering.Test.Office.Word.Cleaners +{ + /// + /// Test class for EmptyParagraphsCleaner. + /// + [TestFixture] + public class EmptyParagraphsCleanerTest + { + private string initialHTML; + private string expectedHTML; + + /// + /// Default constructor. + /// + public EmptyParagraphsCleanerTest() + { + initialHTML = ""; + expectedHTML = ""; + } + + [TestFixtureSetUp] + public void TestSetup() + { + initialHTML = " " + + "

Text

 

"; + + expectedHTML = " " + + "

Text



"; + } + + [Test] + public void TestCleaner() + { + initialHTML = new EmptyParagraphsCleaner().Clean(initialHTML); + Assert.AreEqual(initialHTML, expectedHTML); + } + } +} Index: xword/ContentFiltering/Test/Office/Word/Cleaners/HeadSectionRemoverTest.cs =================================================================== --- xword/ContentFiltering/Test/Office/Word/Cleaners/HeadSectionRemoverTest.cs (revision 0) +++ xword/ContentFiltering/Test/Office/Word/Cleaners/HeadSectionRemoverTest.cs (revision 0) @@ -0,0 +1,44 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using NUnit.Framework; +using ContentFiltering.Office.Word.Cleaners; + +namespace ContentFiltering.Test.Office.Word.Cleaners +{ + /// + /// Test class for HeadSectionRemover. + /// + [TestFixture] + public class HeadSectionRemoverTest + { + private string initialHTML; + private string expectedHTML; + + /// + /// Default constructor. + /// + public HeadSectionRemoverTest() + { + initialHTML = ""; + expectedHTML=""; + } + + [TestFixtureSetUp] + public void TestSetup() + { + initialHTML = "Title" + + "

Content

"; + + expectedHTML = "

Content

"; + } + + [Test] + public void TestCleaner() + { + initialHTML = new HeadSectionRemover().Clean(initialHTML); + Assert.AreEqual(initialHTML, expectedHTML); + } + } +} Index: xword/ContentFiltering/Test/Office/Word/Cleaners/ListCharsCleanerTest.cs =================================================================== --- xword/ContentFiltering/Test/Office/Word/Cleaners/ListCharsCleanerTest.cs (revision 0) +++ xword/ContentFiltering/Test/Office/Word/Cleaners/ListCharsCleanerTest.cs (revision 0) @@ -0,0 +1,42 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using NUnit.Framework; +using ContentFiltering.Office.Word.Cleaners; + +namespace ContentFiltering.Test.Office.Word.Cleaners +{ + /// + /// Test class for ListCharsCleaner. + /// + [TestFixture] + public class ListCharsCleanerTest + { + private string initialHTML; + private string expectedHTML; + + /// + /// Default constructor. + /// + public ListCharsCleanerTest() + { + initialHTML = ""; + expectedHTML = ""; + } + + [TestFixtureSetUp] + public void TestSetup() + { + initialHTML = "

·Item1
·Item2

§Another item

"; + expectedHTML = "

oItem1
oItem2

oAnother item

"; + } + + [Test] + public void TestCleaner() + { + initialHTML = new ListCharsCleaner().Clean(initialHTML); + Assert.AreEqual(initialHTML, expectedHTML); + } + } +} Index: xword/ContentFiltering/Test/Office/Word/Cleaners/NbspBetweenTagsRemoverTest.cs =================================================================== --- xword/ContentFiltering/Test/Office/Word/Cleaners/NbspBetweenTagsRemoverTest.cs (revision 0) +++ xword/ContentFiltering/Test/Office/Word/Cleaners/NbspBetweenTagsRemoverTest.cs (revision 0) @@ -0,0 +1,42 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using NUnit.Framework; +using ContentFiltering.Office.Word.Cleaners; + +namespace ContentFiltering.Test.Office.Word.Cleaners +{ + /// + /// Test class for NbspBetweenTagsRemover. + /// + [TestFixture] + public class NbspBetweenTagsRemoverTest + { + private string initialHTML; + private string expectedHTML; + + /// + /// Default constructor. + /// + public NbspBetweenTagsRemoverTest() + { + initialHTML = ""; + expectedHTML = ""; + } + + [TestFixtureSetUp] + public void TestSetup() + { + initialHTML = "

 text

 

"; + expectedHTML = "

text

"; + } + + [Test] + public void Test() + { + initialHTML = new NbspBetweenTagsRemover().Clean(initialHTML); + Assert.AreEqual(initialHTML, expectedHTML); + } + } +} Index: xword/ContentFiltering/Test/Office/Word/Cleaners/NbspReplacerTest.cs =================================================================== --- xword/ContentFiltering/Test/Office/Word/Cleaners/NbspReplacerTest.cs (revision 0) +++ xword/ContentFiltering/Test/Office/Word/Cleaners/NbspReplacerTest.cs (revision 0) @@ -0,0 +1,42 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using NUnit.Framework; +using ContentFiltering.Office.Word.Cleaners; + +namespace ContentFiltering.Test.Office.Word.Cleaners +{ + /// + /// Test class for NbspReplacer. + /// + [TestFixture] + public class NbspReplacerTest + { + private string initialHTML; + private string expectedHTML; + + /// + /// Default constructor. + /// + public NbspReplacerTest() + { + initialHTML = ""; + expectedHTML = ""; + } + + [TestFixtureSetUp] + public void TestSetup() + { + initialHTML = "

Some content

More content here

"; + expectedHTML = "

Some content

More content here

"; + } + + [Test] + public void Test() + { + initialHTML = new NbspReplacer().Clean(initialHTML); + Assert.AreEqual(initialHTML, expectedHTML); + } + } +} Index: xword/ContentFiltering/Test/Office/Word/Cleaners/OfficeNameSpacesTagsRemoverTest.cs =================================================================== --- xword/ContentFiltering/Test/Office/Word/Cleaners/OfficeNameSpacesTagsRemoverTest.cs (revision 0) +++ xword/ContentFiltering/Test/Office/Word/Cleaners/OfficeNameSpacesTagsRemoverTest.cs (revision 0) @@ -0,0 +1,54 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using NUnit.Framework; +using ContentFiltering.Office.Word.Cleaners; + +namespace ContentFiltering.Test.Office.Word.Cleaners +{ + /// + /// Test class for OfficeNameSpacesTagsRemover. + /// + [TestFixture] + public class OfficeNameSpacesTagsRemoverTest + { + private string initialHTML; + private string expectedHTML; + + /// + /// Default constructor. + /// + public OfficeNameSpacesTagsRemoverTest() + { + initialHTML = ""; + expectedHTML = ""; + } + + [TestFixtureSetUp] + public void TestSetup() + { + initialHTML = "" + + "

Text

 "; + + expectedHTML="" + + "

Text

"; + } + + [Test] + public void TestCleaner() + { + IHTMLCleaner officeNameSpaceCleaner = new OfficeNameSpacesTagsRemover(); + initialHTML = officeNameSpaceCleaner.Clean(initialHTML); + Assert.AreEqual(initialHTML, expectedHTML); + } + } +} Index: xword/ContentFiltering/Test/Office/Word/Cleaners/XmlNamespaceDefinitionsReplacerTest.cs =================================================================== --- xword/ContentFiltering/Test/Office/Word/Cleaners/XmlNamespaceDefinitionsReplacerTest.cs (revision 0) +++ xword/ContentFiltering/Test/Office/Word/Cleaners/XmlNamespaceDefinitionsReplacerTest.cs (revision 0) @@ -0,0 +1,55 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using NUnit.Framework; +using ContentFiltering.Office.Word.Cleaners; + +namespace ContentFiltering.Test.Office.Word.Cleaners +{ + /// + /// Test class for XmlNamespaceDefinitionsReplacer. + /// + [TestFixture] + public class XmlNamespaceDefinitionsReplacerTest + { + private string initialHTML; + private string expectedHTML; + private string newTag; + + public XmlNamespaceDefinitionsReplacerTest() + { + initialHTML = ""; + expectedHTML = ""; + newTag = ""; + } + + [TestFixtureSetUp] + public void TestSetup() + { + initialHTML = "

Content

"; + expectedHTML = "" + + "

Content

"; + newTag = ""; + } + + + [Test] + public void TestCleaner() + { + initialHTML = new XmlNamespaceDefinitionsReplacer(newTag).Clean(initialHTML); + Assert.AreEqual(initialHTML, expectedHTML); + + } + + + } +} Index: xword/XWord/AddinActions.cs =================================================================== --- xword/XWord/AddinActions.cs (revision 21692) +++ xword/XWord/AddinActions.cs (working copy) @@ -17,6 +17,7 @@ using Microsoft.Office.Core; using XWord.VstoExtensions; using XWiki.Logging; +using ContentFiltering.Office.Word.Cleaners; namespace XWord { @@ -32,7 +33,7 @@ HtmlUtil htmlUtil = new HtmlUtil(); //A dictionary, storing the converter instances for each opened page. Dictionary pageConverters = new Dictionary(); - + private const string DOWNLOAD_FOLDER = "XWord"; //"MyDocuments\XWord" private string TEMP_UPLOAD_FILES_FOLDER = Environment.SpecialFolder.ApplicationData.ToString() + @"\XWordTempData\UploadedFiles"; private string TEMP_FILES_FOLDER = Environment.SpecialFolder.ApplicationData.ToString() + @"\XWordTempData\Pages"; @@ -50,8 +51,8 @@ bool checkGrammarWithSpelling = false; bool checkSpellingAsYouType = false; bool contextualSpeller = false; - + /// /// Generic webclient used for conneting to xwiki. /// @@ -124,7 +125,7 @@ { return false; } - + } /// @@ -228,7 +229,7 @@ /// The full name of the wiki page that is being opened for editing. public void EditPage(String pageFullName) { - if(IsOpened(pageFullName)) + if (IsOpened(pageFullName)) { UserNotifier.Message("You are already editing this page."); return; @@ -237,7 +238,7 @@ { Client.Login(addin.username, addin.password); } - if(IsProtectedPage(pageFullName, addin.ProtectedPages)) + if (IsProtectedPage(pageFullName, addin.ProtectedPages)) { String message = "You cannot edit this page." + Environment.NewLine; message += "This page contains scrips that provide functionality to the wiki."; @@ -266,10 +267,10 @@ String pageFullName = (String)_pageFullName; //Read from server String content = Client.GetRenderedPageContent(pageFullName); - + String localFileName = pageFullName.Replace(".", "-"); String folder = addin.PagesRepository + "TempPages"; - ConvertToNormalFolder(folder); + ConvertToNormalFolder(folder); //content = new WebToLocalHTML(addin.serverURL, folder, localFileName).AdaptSource(content); ConversionManager pageConverter; if (pageConverters.ContainsKey(pageFullName)) @@ -303,7 +304,7 @@ addin.EditedPages.Add(localFileName, pageFullName); addin.currentPageFullName = pageFullName; //Open the file with Word - Word.Document doc = OpenHTMLDocument(localFileName); + Word.Document doc = OpenHTMLDocument(localFileName); #endregion//Open local document //Mark just-opened document as saved. This prevents a silly confirmation box that @@ -312,8 +313,8 @@ } catch (IOException ex) { - UserNotifier.Error(ex.Message); - } + UserNotifier.Error(ex.Message); + } } /// @@ -427,10 +428,10 @@ { if (addin.currentPageFullName == "" || addin.currentPageFullName == null) { - UserNotifier.Exclamation("You are not currently editing a wiki page") ; + UserNotifier.Exclamation("You are not currently editing a wiki page"); return; } - + LoadingDialog loadingDialog = new LoadingDialog("Saving to wiki..."); ThreadPool.QueueUserWorkItem(new WaitCallback(loadingDialog.ShowSyncDialog)); SaveToXwiki(); @@ -471,12 +472,10 @@ sr.Close(); File.Delete(contentFilePath); String cleanHTML = ""; - cleanHTML = htmlUtil.RemoveSpecificTagContent(fileContent, ""); - cleanHTML = htmlUtil.RemoveSpecificTagContent(cleanHTML, ""); - //cleanHTML = htmlUtil.RemoveSpecificTagContent(cleanHTML, ""); - cleanHTML = htmlUtil.RemoveSpecificTagContent(cleanHTML, "", ""); - //cleanHTML = htmlUtil.CleanHTML(cleanHTML, true); - //cleanHTML = htmlUtil.GetBodyContent(cleanHTML); + + cleanHTML = new CommentsRemover().Clean(fileContent); + cleanHTML = new HeadSectionRemover().Clean(cleanHTML); + ConversionManager pageConverter; if (pageConverters.ContainsKey(addin.currentPageFullName)) { @@ -488,7 +487,8 @@ addin.currentPageFullName, Path.GetFileName(contentFilePath), addin.Client); } cleanHTML = pageConverter.ConvertFromWordToWeb(cleanHTML); - cleanHTML = htmlUtil.GetBodyContent(cleanHTML); + cleanHTML = new BodyContentExtractor().Clean(cleanHTML); + //openHTMLDocument(addin.currentLocalFilePath); if (addin.AddinStatus.Syntax == null) { @@ -589,7 +589,7 @@ //If it's a new space, add it to the wiki structure and mark it as unpublished List spaces = Globals.XWikiAddIn.wiki.spaces; - Space space=null; + Space space = null; foreach (Space sp in spaces) { if (sp.name == spaceName) @@ -607,7 +607,7 @@ } } - if (space==null) + if (space == null) { space = new Space(); space.name = spaceName; @@ -620,7 +620,7 @@ xwdoc.published = false; xwdoc.space = spaceName; space.documents.Add(xwdoc); - } + } } catch (IOException ex) { @@ -670,27 +670,27 @@ UserNotifier.Error("There was an error on the server. The pages in MSOffice space don't have programming rights"); hasErrors = true; } - else if(content.Contains(HTTPResponses.WRONG_REQUEST)) + else if (content.Contains(HTTPResponses.WRONG_REQUEST)) { Log.Error("Server " + addin.serverURL + " wrong request"); UserNotifier.Error("Server error: Wrong request"); hasErrors = true; } - else if(content.Contains(HTTPResponses.NO_EDIT_RIGHTS)) + else if (content.Contains(HTTPResponses.NO_EDIT_RIGHTS)) { Log.Information("User tried to edit a page on " + addin.serverURL + " whithout edit rights"); UserNotifier.Error("You dont have the right to edit this page"); hasErrors = true; } - else if(content.Contains(HTTPResponses.NO_GROOVY_RIGHTS)) + else if (content.Contains(HTTPResponses.NO_GROOVY_RIGHTS)) { Log.Error("Server " + addin.serverURL + " error on parsing groovy - no groovy rights"); String message = "There was an error on the server." + Environment.NewLine; - message += "Please contact your server adminitrator. Error on executing groovy page in MSOffice space"; + message += "Please contact your server adminitrator. Error on executing groovy page in MSOffice space"; UserNotifier.Error(message); hasErrors = true; } - else if(content.Contains(HTTPResponses.INSUFFICIENT_MEMMORY)) + else if (content.Contains(HTTPResponses.INSUFFICIENT_MEMMORY)) { Log.Error("Server " + addin.serverURL + " reported OutOfMemmoryException"); String message = "There was an error on the server." + Environment.NewLine; @@ -698,7 +698,7 @@ UserNotifier.Error(message); hasErrors = true; } - else if(content.Contains(HTTPResponses.VELOCITY_PARSER_ERROR)) + else if (content.Contains(HTTPResponses.VELOCITY_PARSER_ERROR)) { Log.Error("Server " + addin.serverURL + " error when parsing page. "); String message = "There was an error on the server" + Environment.NewLine; @@ -721,7 +721,7 @@ foreach (String wildcard in wildcards) { String docFullName = doc.space + "." + doc.name; - if(UtilityClass.IsWildcardMatch(wildcard, docFullName, true)) + if (UtilityClass.IsWildcardMatch(wildcard, docFullName, true)) { wiki.RemoveXWikiDocument(doc); break; @@ -781,7 +781,7 @@ UserNotifier.Error(ioex.Message); return false; } - + return true; } }