Index: xword/ContentFiltering/ContentFiltering.csproj
===================================================================
--- xword/ContentFiltering/ContentFiltering.csproj (revision 21692)
+++ xword/ContentFiltering/ContentFiltering.csproj (working copy)
@@ -60,10 +60,24 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
@@ -76,6 +90,18 @@
+
+
+
+
+
+
+
+
+
+
+
+
Index: xword/ContentFiltering/Html/HtmlUtil.cs
===================================================================
--- xword/ContentFiltering/Html/HtmlUtil.cs (revision 21692)
+++ xword/ContentFiltering/Html/HtmlUtil.cs (working copy)
@@ -6,6 +6,7 @@
using System.Text.RegularExpressions;
using TidyNet;
using TidyNet.Dom;
+using ContentFiltering.Office.Word.Cleaners;
namespace XWiki.Html
{
@@ -21,7 +22,7 @@
/// The cleaned html code.
public String HtmlToXhtml(String htmlSource)
{
- return CleanHTML(htmlSource, false);
+ return new TidyHTMLCleaner(false).Clean(htmlSource);
}
///
@@ -31,191 +32,11 @@
/// The cleaned html.
public String WordHtmlToXhtml(String htmlSource)
{
- return CleanHTML(htmlSource, true);
+ return new TidyHTMLCleaner(true).Clean(htmlSource);
}
- ///
- /// Uses Tidy.Net to clean a html source.
- ///
- /// The original html source.
- /// Specifies if the source is an output from Microsoft Word
- /// The cleaned Html.
- public String CleanHTML(String htmlSource,bool isWordHtml)
- {
- Tidy tidy = new Tidy();
- //Options required dor xhtml conversion.
- tidy.Options.DocType = DocType.Strict;
- tidy.Options.DropFontTags = true;
- tidy.Options.LogicalEmphasis = true;
- tidy.Options.Xhtml = true;
- tidy.Options.XmlOut = true;
- tidy.Options.MakeClean = true;
- tidy.Options.TidyMark = false;
- tidy.Options.DropEmptyParas = true;
- tidy.Options.IndentContent = true;
- tidy.Options.SmartIndent = true;
- tidy.Options.Word2000 = isWordHtml;
- tidy.Options.EncloseBlockText = true;
-
- tidy.Options.XmlTags = true;
- tidy.Options.FixComments = true;
- TidyMessageCollection tmc = new TidyMessageCollection();
- MemoryStream input = new MemoryStream();
- MemoryStream output = new MemoryStream();
-
- byte[] byteArray = Encoding.UTF8.GetBytes(htmlSource);
- input.Write(byteArray, 0, byteArray.Length);
- input.Position = 0;
- try
- {
- tidy.Parse(input, output, tmc);
- }
- catch (FormatException ex)
- {
- Log.Exception(ex);
- return htmlSource;
- }
- string cleanContent = Encoding.UTF8.GetString(output.ToArray());
- return cleanContent;
- }
///
- /// Gets a list with all the tags that contain attributes.
- ///
- /// The html source.
- /// A of strings with the tags containing attributes.
- public List GetTagsWithAttributes(String htmlSource)
- {
- List tags = new List();
- int startIndex = 0;
- int endIndex = 0;
- do
- {
- startIndex = htmlSource.IndexOf('<', endIndex);
- if (startIndex >= 0)
- {
- endIndex = htmlSource.IndexOf('>', startIndex);
- if (endIndex >= 0)
- {
- String tag = htmlSource.Substring(startIndex, endIndex - startIndex + 1);
- if (tag.Contains('='))
- {
- tags.Add(tag);
- }
- }
- }
-
- } while (startIndex < (htmlSource.Length - 1) && endIndex < (htmlSource.Length - 1) && (startIndex >= 0) && (endIndex >= 0));
- return tags;
- }
-
- ///
- /// Corrects the img and br tags generated by Word.
- ///
- /// The html source to be corrected.
- /// The name if the tag. Eg: "img", "br".
- /// The corrected html coe.
- public String CorrectTagsClosing(String htmlSource, String tagName)
- {
- //The string builder will be appendend when more then 1000 corrupted tags are found.
- int slack = 1000;
- string correctionString = " /";
- string searchedString = "<" + tagName;
- StringBuilder sb = new StringBuilder(htmlSource.Length + slack);
- sb.Insert(0, htmlSource);
- int startIndex = 0;
- int endIndex = 0;
- int nonValidTags = 0;
- do
- {
- startIndex = htmlSource.IndexOf(searchedString, endIndex);
- if (startIndex >= 0)
- {
- endIndex = htmlSource.IndexOf('>', startIndex);
- if (endIndex > 0)
- {
- //The tag is missing the '/' before the '>' character
- if (!(htmlSource[endIndex - 1].CompareTo('/') == 0))
- {
- sb.Insert(endIndex + nonValidTags * correctionString.Length, correctionString);
- nonValidTags++;
- }
- }
- }
- } while (startIndex < (htmlSource.Length - 1) && endIndex < (htmlSource.Length - 1) && (startIndex >= 0) && (endIndex >= 0));
- return sb.ToString();
- }
-
- ///
- /// Corrects the attributes that miss ' or ".
- ///
- /// The original html source code.
- /// The source with corrected attributes.
- public String CorrectAttributes(String htmlSource)
- {
- StringBuilder sb = new StringBuilder(htmlSource);
- List tags = GetTagsWithAttributes(htmlSource);
- foreach(String initialValue in tags)
- {
- String value = initialValue;
- char[] separators = {' ','>','/','\r'};
- bool hasChanged = false;
- foreach (String s in initialValue.Split(separators))
- {
- String[] attribute = s.Split('=');
- if(attribute.Length == 2)
- {
- try
- {
- String newValue = attribute[1];
- if (attribute[1][0] != '\'' && attribute[1][0] != '\"')
- {
- newValue = attribute[0] + "=\"" + attribute[1] + "\"";
- value = value.Replace(s, newValue);
- hasChanged = true;
- }
- }
- catch (IndexOutOfRangeException) { };
- }
- }
- if (hasChanged)
- {
- sb = sb.Replace(initialValue, value);
- }
- }
- return sb.ToString();
- }
-
- ///
- /// Removes the tags that are in the office namespaces.
- ///
- /// The original content.
- /// The cleaned content.
- public String RemoveOfficeNameSpacesTags(String content)
- {
- bool foundTags = false;
- int startIndex = 0;
- int endIndex = 0;
- do
- {
- foundTags = false;
- startIndex = content.IndexOf("= 0)
- {
- endIndex = content.IndexOf("= 0)
- {
- endIndex = content.IndexOf(">",endIndex + 1);
- content = content.Remove(startIndex, endIndex - startIndex + 1);
- }
- foundTags = true;
- startIndex = endIndex - (endIndex - startIndex + 1);
- }
- } while (foundTags);
- return content;
- }
-
- ///
/// Removes a char sequence that starts and ends with the given valaues.
///
/// The initial content.
@@ -245,25 +66,8 @@
return content;
}
+
///
- /// Gets the content between the opening and closing html tags.
- ///
- /// The html source to be
- /// the inner html of the body.
- public String GetBodyContent(String htmlCode)
- {
- //Delete header & footer
- int startIndex, endIndex;
- startIndex = htmlCode.IndexOf("", startIndex);
- htmlCode = htmlCode.Remove(0, endIndex + 1);
- startIndex = htmlCode.IndexOf("= 0)
- htmlCode = htmlCode.Remove(startIndex);
- return htmlCode;
- }
-
- ///
/// Indents the given html source.
///
/// The html source.
@@ -285,67 +89,8 @@
return htmlSource;
}
- ///
- /// Removes the doctype declaration from an given html code.
- ///
- /// The original html code.
- /// The modified html code.
- public String RemoveDoctype(String htmlCode)
- {
- int startIndex, endIndex;
- startIndex = htmlCode.IndexOf("", startIndex);
- return htmlCode.Remove(startIndex, endIndex - startIndex);
- }
///
- /// Gets a string representing the opening html tag with the XML namespace definitions, if any.
- ///
- /// The html source to be processed
- /// a string representing the opening html tag.
- public String GetXmlNamespaceDefinitions(String htmlCode)
- {
- int startIndex, endIndex;
- startIndex = htmlCode.IndexOf("", startIndex);
- return htmlCode.Substring(startIndex, endIndex - startIndex + 1);
- }
- }
-
- ///
- /// Replaces the opening html tag with a given one.
- ///
- /// The html source.
- /// The new html tag.
- ///
- public String ReplaceXmlNamespaceDefinitions(String htmlCode, String newHtmlTag)
- {
- String oldHtmlTag = GetXmlNamespaceDefinitions(htmlCode);
- if (oldHtmlTag == null)
- {
- if (!htmlCode.Contains("");
- htmlCode = htmlCode.Insert(htmlCode.Length, "");
- }
- htmlCode = htmlCode.Insert(0, newHtmlTag);
- htmlCode = htmlCode.Insert(htmlCode.Length, "");
- }
- else
- {
- htmlCode = htmlCode.Replace(oldHtmlTag, newHtmlTag);
- }
- return htmlCode;
- }
-
- ///
/// Replaces the body tag with a new given one.
///
/// The initial html code.
Index: xword/ContentFiltering/Office/Word/Cleaners/BodyContentExtractor.cs
===================================================================
--- xword/ContentFiltering/Office/Word/Cleaners/BodyContentExtractor.cs (revision 0)
+++ xword/ContentFiltering/Office/Word/Cleaners/BodyContentExtractor.cs (revision 0)
@@ -0,0 +1,36 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace ContentFiltering.Office.Word.Cleaners
+{
+ /// <summary>
+ /// HTML cleaner that keeps only the inner html of the body.
+ /// </summary>
+ public class BodyContentExtractor : IHTMLCleaner
+ {
+
+ #region IHTMLCleaner Members
+
+ /// <summary>
+ /// Gets the content between the opening and closing body tags (NOTE(review): the search literals are not legible here - confirm).
+ /// </summary>
+ /// <param name="htmlCode">The html source to be processed.</param>
+ /// <returns>The inner html of the body.</returns>
+ public string Clean(string htmlCode)
+ {
+ //Delete header & footer: the leading part (up to and including endIndex) is cut first, then everything from startIndex to the end.
+ int startIndex, endIndex;
+ startIndex = htmlCode.IndexOf("", startIndex);
+ htmlCode = htmlCode.Remove(0, endIndex + 1);
+ startIndex = htmlCode.IndexOf("= 0)
+ htmlCode = htmlCode.Remove(startIndex);
+ return htmlCode;
+ }
+
+ #endregion IHTMLCleaner Members
+ }
+}
Index: xword/ContentFiltering/Office/Word/Cleaners/CommentsRemover.cs
===================================================================
--- xword/ContentFiltering/Office/Word/Cleaners/CommentsRemover.cs (revision 0)
+++ xword/ContentFiltering/Office/Word/Cleaners/CommentsRemover.cs (revision 0)
@@ -0,0 +1,35 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using XWiki.Html;
+
+namespace ContentFiltering.Office.Word.Cleaners
+{
+ /// <summary>
+ /// Removes comments ('&lt;!-- ... &gt;' and '&lt;![ ... ]&gt;') from an html source.
+ /// </summary>
+ public class CommentsRemover : IHTMLCleaner
+ {
+ private HtmlUtil htmlUtil;
+
+ public CommentsRemover()
+ {
+ htmlUtil = new HtmlUtil();
+ }
+ #region IHTMLCleaner Members
+
+ /// <summary>
+ /// Removes comments ('&lt;!-- ... &gt;' and '&lt;![ ... ]&gt;') by calling HtmlUtil.RemoveSpecificTagContent twice, the second call on the output of the first.
+ /// </summary>
+ /// <param name="htmlSource">The HTML source to clean.</param>
+ /// <returns>The cleaned HTML source (without comments).</returns>
+ public string Clean(string htmlSource)
+ {
+ string cleanHTML = htmlUtil.RemoveSpecificTagContent(htmlSource, "");
+ return htmlUtil.RemoveSpecificTagContent(cleanHTML, "");
+ }
+
+ #endregion IHTMLCleaner Members
+ }
+}
Index: xword/ContentFiltering/Office/Word/Cleaners/CorrectAttributesCleaner.cs
===================================================================
--- xword/ContentFiltering/Office/Word/Cleaners/CorrectAttributesCleaner.cs (revision 0)
+++ xword/ContentFiltering/Office/Word/Cleaners/CorrectAttributesCleaner.cs (revision 0)
@@ -0,0 +1,87 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace ContentFiltering.Office.Word.Cleaners
+{
+ /// <summary>
+ /// Corrects the attributes whose values are missing the surrounding ' or ".
+ /// </summary>
+ public class CorrectAttributesCleaner : IHTMLCleaner
+ {
+ #region IHTMLCleaner Members
+
+ /// <summary>
+ /// Wraps unquoted attribute values in double quotes, for every tag returned by GetTagsWithAttributes.
+ /// </summary>
+ /// <param name="htmlSource">The original html source code.</param>
+ /// <returns>The source with corrected attributes.</returns>
+ public string Clean(string htmlSource)
+ {
+ StringBuilder sb = new StringBuilder(htmlSource);
+ List tags = GetTagsWithAttributes(htmlSource);
+ foreach (String initialValue in tags)
+ {
+ String value = initialValue;
+ char[] separators = { ' ', '>', '/', '\r' };
+ bool hasChanged = false;
+ foreach (String s in initialValue.Split(separators)) // each piece is a candidate name=value pair
+ {
+ String[] attribute = s.Split('=');
+ if (attribute.Length == 2) // exactly one '=' in the piece
+ {
+ try
+ {
+ String newValue = attribute[1];
+ if (attribute[1][0] != '\'' && attribute[1][0] != '\"')
+ {
+ newValue = attribute[0] + "=\"" + attribute[1] + "\"";
+ value = value.Replace(s, newValue);
+ hasChanged = true;
+ }
+ }
+ catch (IndexOutOfRangeException) { }; // attribute[1] is empty (nothing after '='): nothing to quote
+ }
+ }
+ if (hasChanged)
+ {
+ sb = sb.Replace(initialValue, value); // swaps every occurrence of the original tag text
+ }
+ }
+ return sb.ToString();
+ }
+
+ #endregion IHTMLCleaner Members
+
+ /// <summary>
+ /// Gets a list with all the tags ('&lt;' up to the next '&gt;') whose text contains an '=' character.
+ /// </summary>
+ /// <param name="htmlSource">The html source.</param>
+ /// <returns>A list of strings with the tags containing attributes.</returns>
+ public List GetTagsWithAttributes(String htmlSource)
+ {
+ List tags = new List();
+ int startIndex = 0;
+ int endIndex = 0;
+ do
+ {
+ startIndex = htmlSource.IndexOf('<', endIndex);
+ if (startIndex >= 0)
+ {
+ endIndex = htmlSource.IndexOf('>', startIndex);
+ if (endIndex >= 0)
+ {
+ String tag = htmlSource.Substring(startIndex, endIndex - startIndex + 1);
+ if (tag.Contains('='))
+ {
+ tags.Add(tag);
+ }
+ }
+ }
+
+ } while (startIndex < (htmlSource.Length - 1) && endIndex < (htmlSource.Length - 1) && (startIndex >= 0) && (endIndex >= 0));
+ return tags;
+ }
+ }
+}
Index: xword/ContentFiltering/Office/Word/Cleaners/CorrectTagsClosingCleaner.cs
===================================================================
--- xword/ContentFiltering/Office/Word/Cleaners/CorrectTagsClosingCleaner.cs (revision 0)
+++ xword/ContentFiltering/Office/Word/Cleaners/CorrectTagsClosingCleaner.cs (revision 0)
@@ -0,0 +1,64 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace ContentFiltering.Office.Word.Cleaners
+{
+ /// <summary>
+ /// Corrects tags such as &lt;img&gt; and &lt;br&gt; generated by Word without a '/' before the closing '&gt;'.
+ /// </summary>
+ class CorrectTagsClosingCleaner : IHTMLCleaner
+ {
+ private string tagName;
+
+ /// <summary>
+ /// Cleaner constructor.
+ /// </summary>
+ /// <param name="tagName">The name of the tag. Eg: "img", "br".</param>
+ public CorrectTagsClosingCleaner(string tagName)
+ {
+ this.tagName = tagName;
+ }
+
+ #region IHTMLCleaner Members
+
+ /// <summary>
+ /// Inserts " /" before the closing '&gt;' of every occurrence of the configured tag that lacks the '/'.
+ /// </summary>
+ /// <param name="htmlSource">The html source to be corrected.</param>
+ /// <returns>The corrected html code.</returns>
+ public string Clean(string htmlSource)
+ {
+ //Extra capacity for the inserted corrections: 1000 chars hold 500 " /" insertions before the string builder has to grow.
+ int slack = 1000;
+ string correctionString = " /";
+ string searchedString = "<" + tagName;
+ StringBuilder sb = new StringBuilder(htmlSource.Length + slack);
+ sb.Insert(0, htmlSource);
+ int startIndex = 0;
+ int endIndex = 0;
+ int nonValidTags = 0;
+ do
+ {
+ startIndex = htmlSource.IndexOf(searchedString, endIndex);
+ if (startIndex >= 0)
+ {
+ endIndex = htmlSource.IndexOf('>', startIndex);
+ if (endIndex > 0)
+ {
+ //The tag is missing the '/' before the '>' character
+ if (!(htmlSource[endIndex - 1].CompareTo('/') == 0))
+ {
+ sb.Insert(endIndex + nonValidTags * correctionString.Length, correctionString); // indexes come from htmlSource, so shift by what was already inserted into sb
+ nonValidTags++;
+ }
+ }
+ }
+ } while (startIndex < (htmlSource.Length - 1) && endIndex < (htmlSource.Length - 1) && (startIndex >= 0) && (endIndex >= 0));
+ return sb.ToString();
+ }
+
+ #endregion IHTMLCleaner Members
+ }
+}
Index: xword/ContentFiltering/Office/Word/Cleaners/DoctypeRemover.cs
===================================================================
--- xword/ContentFiltering/Office/Word/Cleaners/DoctypeRemover.cs (revision 0)
+++ xword/ContentFiltering/Office/Word/Cleaners/DoctypeRemover.cs (revision 0)
@@ -0,0 +1,31 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace ContentFiltering.Office.Word.Cleaners
+{
+ /// <summary>
+ /// Removes the doctype declaration from a given html code.
+ /// </summary>
+ public class DoctypeRemover : IHTMLCleaner
+ {
+ #region IHTMLCleaner Members
+
+ /// <summary>
+ /// Removes the doctype declaration by cutting the characters from startIndex up to (not including) endIndex. NOTE(review): the search literals are not legible here - confirm.
+ /// </summary>
+ /// <param name="htmlCode">The original html code.</param>
+ /// <returns>The modified html code.</returns>
+ public string Clean(string htmlCode)
+ {
+ int startIndex, endIndex;
+ startIndex = htmlCode.IndexOf("", startIndex);
+ return htmlCode.Remove(startIndex, endIndex - startIndex);
+ }
+
+ #endregion IHTMLCleaner Members
+
+ }
+}
Index: xword/ContentFiltering/Office/Word/Cleaners/EmptyParagraphsCleaner.cs
===================================================================
--- xword/ContentFiltering/Office/Word/Cleaners/EmptyParagraphsCleaner.cs (revision 0)
+++ xword/ContentFiltering/Office/Word/Cleaners/EmptyParagraphsCleaner.cs (revision 0)
@@ -0,0 +1,29 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace ContentFiltering.Office.Word.Cleaners
+{
+ /// <summary>
+ /// Replaces empty paragraphs with line breaks ('&lt;br/&gt;').
+ /// </summary>
+ public class EmptyParagraphsCleaner : IHTMLCleaner
+ {
+ #region IHTMLCleaner Members
+
+ /// <summary>
+ /// Replaces empty paragraphs with line breaks ('&lt;br/&gt;') using two successive string replacements.
+ /// </summary>
+ /// <param name="htmlSource">Initial HTML source.</param>
+ /// <returns>Cleaned HTML source (empty paragraphs replaced with line breaks).</returns>
+ public string Clean(string htmlSource)
+ {
+ htmlSource = htmlSource.Replace("", "
");
+ htmlSource = htmlSource.Replace("
", "
");
+ return htmlSource;
+ }
+
+ #endregion IHTMLCleaner Members
+ }
+}
Index: xword/ContentFiltering/Office/Word/Cleaners/HeadSectionRemover.cs
===================================================================
--- xword/ContentFiltering/Office/Word/Cleaners/HeadSectionRemover.cs (revision 0)
+++ xword/ContentFiltering/Office/Word/Cleaners/HeadSectionRemover.cs (revision 0)
@@ -0,0 +1,37 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using XWiki.Html;
+
+namespace ContentFiltering.Office.Word.Cleaners
+{
+ /// <summary>
+ /// Removes the head section from an html source.
+ /// </summary>
+ public class HeadSectionRemover : IHTMLCleaner
+ {
+ private HtmlUtil htmlUtil;
+
+ /// <summary>
+ /// Default constructor; creates the HtmlUtil instance used by Clean.
+ /// </summary>
+ public HeadSectionRemover()
+ {
+ htmlUtil = new HtmlUtil();
+ }
+ #region IHTMLCleaner Members
+
+ /// <summary>
+ /// Removes the head section from an html source by delegating to HtmlUtil.RemoveSpecificTagContent.
+ /// </summary>
+ /// <param name="htmlSource">The HTML source.</param>
+ /// <returns>The HTML source without the head section.</returns>
+ public string Clean(string htmlSource)
+ {
+ return htmlUtil.RemoveSpecificTagContent(htmlSource, "", "");
+ }
+
+ #endregion IHTMLCleaner Members
+ }
+}
Index: xword/ContentFiltering/Office/Word/Cleaners/IHTMLCleaner.cs
===================================================================
--- xword/ContentFiltering/Office/Word/Cleaners/IHTMLCleaner.cs (revision 0)
+++ xword/ContentFiltering/Office/Word/Cleaners/IHTMLCleaner.cs (revision 0)
@@ -0,0 +1,15 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace ContentFiltering.Office.Word.Cleaners
+{
+ /// <summary>
+ /// Interface for HTML cleaners (pre-DOM filters).
+ /// </summary>
+ public interface IHTMLCleaner
+ {
+ string Clean(string htmlSource); // takes the HTML source as a string and returns the cleaned string
+ }
+}
Index: xword/ContentFiltering/Office/Word/Cleaners/ListCharsCleaner.cs
===================================================================
--- xword/ContentFiltering/Office/Word/Cleaners/ListCharsCleaner.cs (revision 0)
+++ xword/ContentFiltering/Office/Word/Cleaners/ListCharsCleaner.cs (revision 0)
@@ -0,0 +1,30 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace ContentFiltering.Office.Word.Cleaners
+{
+ /// <summary>
+ /// Replaces some characters used by MS Word for bullet lists with 'o' characters.
+ /// </summary>
+ public class ListCharsCleaner : IHTMLCleaner
+ {
+ #region IHTMLCleaner Members
+
+ /// <summary>
+ /// Replaces some characters used by MS Word for bullet lists (like '·' or '§'
)
+ /// with 'o' characters.
+ /// </summary>
+ /// <param name="htmlSource">Initial HTML source.</param>
+ /// <returns>Cleaned HTML source.</returns>
+ public string Clean(string htmlSource)
+ {
+ htmlSource = htmlSource.Replace('·', 'o');
+ htmlSource = htmlSource.Replace('§', 'o');
+ return htmlSource;
+ }
+
+ #endregion IHTMLCleaner Members
+ }
+}
Index: xword/ContentFiltering/Office/Word/Cleaners/NbspBetweenTagsRemover.cs
===================================================================
--- xword/ContentFiltering/Office/Word/Cleaners/NbspBetweenTagsRemover.cs (revision 0)
+++ xword/ContentFiltering/Office/Word/Cleaners/NbspBetweenTagsRemover.cs (revision 0)
@@ -0,0 +1,27 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace ContentFiltering.Office.Word.Cleaners
+{
+ /// <summary>
+ /// Removes the &amp;nbsp; between tags (NOTE(review): entity inferred from the class name - confirm against the literal in Clean).
+ /// </summary>
+ public class NbspBetweenTagsRemover:IHTMLCleaner
+ {
+ #region IHTMLCleaner Members
+
+ /// <summary>
+ /// Removes the single separator found directly between a closing '&gt;' and the next opening '&lt;'.
+ /// </summary>
+ /// <param name="htmlSource">Initial HTML source.</param>
+ /// <returns>Cleaned HTML.</returns>
+ public string Clean(string htmlSource)
+ {
+ return htmlSource.Replace("> <", "><");
+ }
+
+ #endregion IHTMLCleaner Members
+ }
+}
Index: xword/ContentFiltering/Office/Word/Cleaners/NbspReplacer.cs
===================================================================
--- xword/ContentFiltering/Office/Word/Cleaners/NbspReplacer.cs (revision 0)
+++ xword/ContentFiltering/Office/Word/Cleaners/NbspReplacer.cs (revision 0)
@@ -0,0 +1,27 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace ContentFiltering.Office.Word.Cleaners
+{
+ /// <summary>
+ /// Replaces one string with another everywhere in the source (NOTE(review): presumably &amp;nbsp; with a plain space, going by the class name - confirm).
+ /// </summary>
+ public class NbspReplacer : IHTMLCleaner
+ {
+ #region IHTMLCleaner Members
+
+ /// <summary>
+ /// Performs a single string replacement over the whole source; unlike NbspBetweenTagsRemover it is not limited to text between tags.
+ /// </summary>
+ /// <param name="htmlSource">Initial HTML source.</param>
+ /// <returns>Cleaned HTML.</returns>
+ public string Clean(string htmlSource)
+ {
+ return htmlSource.Replace(" ", " ");
+ }
+
+ #endregion IHTMLCleaner Members
+ }
+}
Index: xword/ContentFiltering/Office/Word/Cleaners/OfficeNameSpacesTagsRemover.cs
===================================================================
--- xword/ContentFiltering/Office/Word/Cleaners/OfficeNameSpacesTagsRemover.cs (revision 0)
+++ xword/ContentFiltering/Office/Word/Cleaners/OfficeNameSpacesTagsRemover.cs (revision 0)
@@ -0,0 +1,46 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace ContentFiltering.Office.Word.Cleaners
+{
+ /// <summary>
+ /// Removes the tags that are in the office namespaces.
+ /// </summary>
+ public class OfficeNameSpacesTagsRemover : IHTMLCleaner
+ {
+ #region IHTMLCleaner Members
+
+ /// <summary>
+ /// Removes the tags that are in the office namespaces, looping until a pass finds no more of them. NOTE(review): the search literals are not legible here - confirm.
+ /// </summary>
+ /// <param name="htmlSource">The original content.</param>
+ /// <returns>The cleaned content.</returns>
+ public string Clean(string htmlSource)
+ {
+ bool foundTags = false;
+ int startIndex = 0;
+ int endIndex = 0;
+ do
+ {
+ foundTags = false;
+ startIndex = htmlSource.IndexOf("= 0)
+ {
+ endIndex = htmlSource.IndexOf("= 0)
+ {
+ endIndex = htmlSource.IndexOf(">", endIndex + 1);
+ htmlSource = htmlSource.Remove(startIndex, endIndex - startIndex + 1);
+ }
+ foundTags = true;
+ startIndex = endIndex - (endIndex - startIndex + 1);
+ }
+ } while (foundTags);
+ return htmlSource;
+ }
+
+ #endregion IHTMLCleaner Members
+ }
+}
Index: xword/ContentFiltering/Office/Word/Cleaners/TidyHTMLCleaner.cs
===================================================================
--- xword/ContentFiltering/Office/Word/Cleaners/TidyHTMLCleaner.cs (revision 0)
+++ xword/ContentFiltering/Office/Word/Cleaners/TidyHTMLCleaner.cs (revision 0)
@@ -0,0 +1,70 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using TidyNet;
+using System.IO;
+using XWiki;
+
+namespace ContentFiltering.Office.Word.Cleaners
+{
+ /// <summary>
+ /// Uses Tidy.Net to clean a html source.
+ /// </summary>
+ public class TidyHTMLCleaner : IHTMLCleaner
+ {
+ private bool isWordHtml;
+
+ public TidyHTMLCleaner(bool isWordHtml)
+ {
+ this.isWordHtml = isWordHtml;
+ }
+ #region IHTMLCleaner Members
+ /// <summary>
+ /// Uses Tidy.Net to clean a html source; the input is passed to Tidy as UTF-8 bytes and the output is decoded as UTF-8.
+ /// </summary>
+ /// <param name="htmlSource">The original html source.</param>
+ /// <remarks>The isWordHtml flag given to the constructor is applied to Tidy's Word2000 option.</remarks>
+ /// <returns>The cleaned Html, or the unchanged input when Tidy throws a FormatException (which is logged).</returns>
+ public string Clean(string htmlSource)
+ {
+ Tidy tidy = new Tidy();
+ //Options required for xhtml conversion.
+ tidy.Options.DocType = DocType.Strict;
+ tidy.Options.DropFontTags = true;
+ tidy.Options.LogicalEmphasis = true;
+ tidy.Options.Xhtml = true;
+ tidy.Options.XmlOut = true;
+ tidy.Options.MakeClean = true;
+ tidy.Options.TidyMark = false;
+ tidy.Options.DropEmptyParas = true;
+ tidy.Options.IndentContent = true;
+ tidy.Options.SmartIndent = true;
+ tidy.Options.Word2000 = isWordHtml;
+ tidy.Options.EncloseBlockText = true;
+
+ tidy.Options.XmlTags = true;
+ tidy.Options.FixComments = true;
+ TidyMessageCollection tmc = new TidyMessageCollection();
+ MemoryStream input = new MemoryStream();
+ MemoryStream output = new MemoryStream();
+
+ byte[] byteArray = Encoding.UTF8.GetBytes(htmlSource);
+ input.Write(byteArray, 0, byteArray.Length);
+ input.Position = 0; // rewind so Tidy reads from the start of the stream
+ try
+ {
+ tidy.Parse(input, output, tmc);
+ }
+ catch (FormatException ex)
+ {
+ Log.Exception(ex);
+ return htmlSource;
+ }
+ string cleanContent = Encoding.UTF8.GetString(output.ToArray());
+ return cleanContent;
+ }
+
+ #endregion IHTMLCleaner Members
+ }
+}
Index: xword/ContentFiltering/Office/Word/Cleaners/XmlNamespaceDefinitionsReplacer.cs
===================================================================
--- xword/ContentFiltering/Office/Word/Cleaners/XmlNamespaceDefinitionsReplacer.cs (revision 0)
+++ xword/ContentFiltering/Office/Word/Cleaners/XmlNamespaceDefinitionsReplacer.cs (revision 0)
@@ -0,0 +1,75 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace ContentFiltering.Office.Word.Cleaners
+{
+ ///
+ /// Replaces the opening html tag with a given one.
+ ///
+ public class XmlNamespaceDefinitionsReplacer : IHTMLCleaner
+ {
+ private string newHtmlTag;
+
+ ///
+ /// Creates a new XmlNamespaceDefinitionsReplacer HTML cleaner (pre-DOM filter).
+ ///
+ /// Openening html tag that will replace the current one.
+ public XmlNamespaceDefinitionsReplacer(string newHtmlTag)
+ {
+ this.newHtmlTag = newHtmlTag;
+ }
+
+ #region IHTMLCleaner Members
+ ///
+ /// Replaces the opening html tag with a given one.
+ ///
+ /// The html source.
+ /// The new html tag.
+ /// Cleaned HTML source.
+ public string Clean(string htmlCode)
+ {
+ String oldHtmlTag = GetXmlNamespaceDefinitions(htmlCode);
+ if (oldHtmlTag == null)
+ {
+ if (!htmlCode.Contains("");
+ htmlCode = htmlCode.Insert(htmlCode.Length, "");
+ }
+ htmlCode = htmlCode.Insert(0, newHtmlTag);
+ htmlCode = htmlCode.Insert(htmlCode.Length, "