An Xml class to linearize Xml, make Xml pretty, and encode it in UTF-8 or UTF-16.
UPDATE: This is now part of the EasyXml NuGet package. There is also an EasyXml.Sources NuGet package.
OK, so I had to output an Xml for something I am doing for work. I need the Xml to be pretty. I decided I wanted a C# class that would do the equivalent of what the Xml plugin in Notepad++ does. It should create Linear Xml, Pretty Xml, etc.
I found a bunch of problems when doing this in C#:
- How to make an Xml linearized in C#?
- How to make the Xml pretty, including indents and cleaned up spacing? I could get the indents, but the spacing was a problem until I solved how to linearize the Xml.
- How to make the Xml declaration say UTF-8? It kept saying the Xml was in UTF-16, which was accurate because the file was UTF-16. Strings in C# are always Unicode (UTF-16).
- How to make UTF-8 uppercase? Once I got the Xml declaration to say UTF-8, the UTF-8 text was lowercase, utf-8, instead of uppercase. Lowercase should work, but it turns out uppercase is preferred.
- How to output the Xml as a file in actual UTF-8 format? It is one thing to have UTF-8 in the Xml declaration, it is quite another to actually output a file byte stream in UTF-8 vs UTF-16 (Unicode). For Xml made up of ASCII-range characters, a UTF-8 file should be about 1/2 the size of a UTF-16 file.
So here is my class. I hope it helps you. All the questions are answered by this class.
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
using System.Xml;
using System.Xml.Linq;
namespace Rhyous.EasyXml
{
/// <summary>
/// Wraps an Xml string and exposes a linearized (single line) and a pretty (indented)
/// rendering of it, plus a byte[] / Stream rendering whose bytes match the encoding
/// named in the Xml declaration (UTF-8 or UTF-16).
/// </summary>
/// <remarks>
/// Linearizing is done with regular expressions before parsing, so every run of white
/// space in the input collapses to one space, including white space inside attribute
/// values, comments and CDATA sections. White space directly after a '>' or directly
/// before a '&lt;' is removed entirely.
/// </remarks>
public class Xml
{
    // The default indent is two spaces per level (the same as the XmlWriterSettings default).
    private const string DefaultIndent = "  ";

    // The three passes used by GetLinearizedXml, created once instead of on every call.
    private static readonly Regex WhitespaceRun = new Regex(@"\s+");
    private static readonly Regex WhitespaceAfterTagEnd = new Regex(@">\s+");
    private static readonly Regex WhitespaceBeforeTagStart = new Regex(@"\s+<");

    private string _LinearizeXml;
    private string _PrettyXml;
    private XDocument _Document;
    private XmlEncoding _XmlEncoding;
    private string _IndentCharacters;

    /// <summary>
    /// Creates a wrapper around the given Xml text. The text is not parsed until
    /// <see cref="Document"/>, <see cref="LinearizeXml"/> or <see cref="PrettyXml"/> is read.
    /// </summary>
    /// <param name="text">The Xml text.</param>
    public Xml(string text)
    {
        Text = text;
    }

    /// <summary>
    /// An enumeration to select UTF8 or UTF16 encoding. This is used because the default UTF8
    /// and Unicode encoding types don't capitalize the UTF characters in the Xml declaration.
    /// </summary>
    public enum XmlEncoding
    {
        UTF8, // By being first it is the default
        UTF16
    };

    /// <summary>
    /// The original Xml text as is.
    /// </summary>
    public string Text { get; private set; }

    /// <summary>
    /// The Xml in a single line (no new lines or carriage returns). The data is trimmed and
    /// there is no more than a single space anywhere. The value is cached until
    /// <see cref="Encoding"/> changes.
    /// </summary>
    public string LinearizeXml
    {
        get
        {
            if (_LinearizeXml == null)
            {
                _LinearizeXml = Clean(Document, LinearizedSettings);
            }
            return _LinearizeXml;
        }
    }

    /// <summary>
    /// An XDocument representation of the Xml. It is parsed from the linearized Xml, not
    /// from the original text, and is parsed only once.
    /// </summary>
    public XDocument Document
    {
        get
        {
            if (_Document == null)
            {
                _Document = XDocument.Parse(GetLinearizedXml(Text));
            }
            return _Document;
        }
    }

    /// <summary>
    /// The Xml with each element properly indented on a separate line. The data is trimmed
    /// and there is no more than a single space anywhere. The value is cached until
    /// <see cref="Encoding"/> or <see cref="IndentCharacters"/> changes.
    /// </summary>
    public string PrettyXml
    {
        get
        {
            if (_PrettyXml == null)
            {
                _PrettyXml = Clean(Document, PrettySettings);
            }
            return _PrettyXml;
        }
    }

    /// <summary>
    /// An enum that specifies whether to use UTF-8 or Unicode (UTF-16).
    /// Changing the encoding doesn't change the original Text, but the pretty and linearized
    /// versions of the Xml change, as do the byte array and the stream.
    /// </summary>
    public XmlEncoding Encoding
    {
        get { return _XmlEncoding; }
        set
        {
            if (_XmlEncoding == value)
            {
                return;
            }
            _XmlEncoding = value;
            // Both cached strings embed the encoding name in the Xml declaration, so they
            // have to be rebuilt. The parsed Document does not depend on the encoding.
            _LinearizeXml = null;
            _PrettyXml = null;
        }
    }

    /// <summary>
    /// The characters to use for indenting Pretty Xml. When null or empty, two spaces are used.
    /// </summary>
    public string IndentCharacters
    {
        get { return _IndentCharacters; }
        set
        {
            if (_IndentCharacters == value)
            {
                return;
            }
            _IndentCharacters = value;
            // Only the pretty rendering is indented.
            _PrettyXml = null;
        }
    }

    /// <summary>
    /// A method that outputs the pretty Xml as a stream. It outputs using the configured Encoding.
    /// It isn't enough to write encoding="UTF-8" in the Xml declaration if the output
    /// is still UTF-16. Both the labeling and the actual bytes should match.
    /// </summary>
    /// <returns>A new in-memory stream over the bytes of the Xml in the configured encoding.</returns>
    public Stream ToStream()
    {
        return new MemoryStream(ToByteArray());
    }

    /// <summary>
    /// This creates a byte array of the pretty Xml using the configured encoding. No byte
    /// order mark is included.
    ///
    /// Note: For Xml that contains only ASCII characters, if UTF-8 is N bytes then UTF-16
    /// is 2*N+2 bytes. This is because "UTF-8" is five characters and "UTF-16" is six
    /// characters. So a 100 byte UTF-8 file would be 202 bytes in UTF-16.
    /// </summary>
    /// <returns>A byte[] array of the Xml string in the configured encoding.</returns>
    public byte[] ToByteArray()
    {
        return GetEncoding().GetBytes(PrettyXml);
    }

    /// <summary>
    /// A method to get the current encoding based on the Enum value.
    /// </summary>
    /// <returns>The UTF-8 encoding for XmlEncoding.UTF8, otherwise the UTF-16 encoding.</returns>
    private Encoding GetEncoding()
    {
        if (Encoding == XmlEncoding.UTF8)
        {
            return XmlUTF8Encoding.Instance;
        }
        return XmlUnicode.Instance;
    }

    /// <summary>
    /// XmlWriterSettings for linearized Xml. A new instance is returned on every read.
    /// </summary>
    private XmlWriterSettings LinearizedSettings
    {
        get
        {
            return new XmlWriterSettings
            {
                Encoding = GetEncoding(),
                Indent = false,
                NewLineOnAttributes = false
            };
        }
    }

    /// <summary>
    /// XmlWriterSettings for Pretty Xml. A new instance is returned on every read.
    /// </summary>
    private XmlWriterSettings PrettySettings
    {
        get
        {
            return new XmlWriterSettings
            {
                Encoding = GetEncoding(),
                Indent = true,
                IndentChars = string.IsNullOrEmpty(IndentCharacters) ? DefaultIndent : IndentCharacters,
                NewLineOnAttributes = false,
                NewLineHandling = NewLineHandling.Replace
            };
        }
    }

    /// <summary>
    /// The method that uses XDocument to make clean (pretty or linearized) Xml.
    /// </summary>
    /// <param name="doc">The XDocument version of the Xml.</param>
    /// <param name="settings">The configured XmlWriterSettings.</param>
    /// <returns>The Xml string written with the given settings.</returns>
    private string Clean(XDocument doc, XmlWriterSettings settings)
    {
        var sb = new StringBuilder();
        // The string writer reports the configured encoding so that the Xml declaration
        // names it instead of the UTF-16 that a plain StringWriter reports.
        using (var stringWriter = new StringWriterWithEncoding(sb, GetEncoding()))
        using (var xmlWriter = XmlWriter.Create(stringWriter, settings))
        {
            doc.Save(xmlWriter);
        }
        // Read the builder only after the Xml writer has been disposed (and so flushed).
        return sb.ToString();
    }

    /// <summary>
    /// A method that uses Regex to linearize Xml. Three replace passes are used.
    /// </summary>
    /// <param name="text">The Xml text</param>
    /// <returns>Linearized Xml string.</returns>
    private static string GetLinearizedXml(string text)
    {
        // Replace all white space with a single space
        var singleSpaced = WhitespaceRun.Replace(text, " ");
        // Trim after >
        var trimmedAfterTags = WhitespaceAfterTagEnd.Replace(singleSpaced, ">");
        // Trim before <
        return WhitespaceBeforeTagStart.Replace(trimmedAfterTags, "<");
    }

    /// <summary>
    /// This class allows for the Xml to be created with the Xml declaration saying UTF-8.
    /// It is a StringWriter that reports the encoding it was given.
    /// </summary>
    private sealed class StringWriterWithEncoding : StringWriter
    {
        private readonly Encoding _Encoding;

        public StringWriterWithEncoding(StringBuilder builder, Encoding encoding)
            : base(builder)
        {
            _Encoding = encoding;
        }

        public override Encoding Encoding
        {
            get { return _Encoding; }
        }
    }

    /// <summary>
    /// This class makes the UTF-8 text in the Xml declaration show up capitalized.
    /// </summary>
    private sealed class XmlUTF8Encoding : UTF8Encoding
    {
        // Created by the type initializer, so no lazy null check is needed.
        private static readonly XmlUTF8Encoding _XmlUTF8Encoding = new XmlUTF8Encoding();

        public static XmlUTF8Encoding Instance
        {
            get { return _XmlUTF8Encoding; }
        }

        // Invariant casing keeps the encoding name independent of the current culture.
        public override string WebName
        {
            get { return base.WebName.ToUpperInvariant(); }
        }

        public override string HeaderName
        {
            get { return base.HeaderName.ToUpperInvariant(); }
        }

        public override string BodyName
        {
            get { return base.BodyName.ToUpperInvariant(); }
        }
    }

    /// <summary>
    /// This class makes the UTF-16 text in the Xml declaration show up capitalized.
    /// </summary>
    private sealed class XmlUnicode : UnicodeEncoding
    {
        // Created by the type initializer, so no lazy null check is needed.
        private static readonly XmlUnicode _XmlUnicode = new XmlUnicode();

        public static XmlUnicode Instance
        {
            get { return _XmlUnicode; }
        }

        // Invariant casing keeps the encoding name independent of the current culture.
        public override string WebName
        {
            get { return base.WebName.ToUpperInvariant(); }
        }

        public override string HeaderName
        {
            get { return base.HeaderName.ToUpperInvariant(); }
        }

        public override string BodyName
        {
            get { return base.BodyName.ToUpperInvariant(); }
        }
    }
}
}
And here are some Unit Tests.
using LANDesk.Licensing.WebServices.Model;
using Microsoft.VisualStudio.TestTools.UnitTesting;
using System.IO;
using System.Text;
namespace Rhyous.EasyXml.Tests
{
/// <summary>
/// Unit tests for the Xml class: linearizing, pretty printing, indent characters and
/// the byte size of the UTF-8 / UTF-16 output.
/// </summary>
[TestClass]
public class XmlTests
{
    // Expected linearized output with a UTF-8 declaration.
    public string LinearUtf8Xml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?><Person><FirstName>John</FirstName><MiddleName>Al Leon</MiddleName><LastName>Doe</LastName></Person>";

    // Expected pretty output: child elements are indented by two spaces.
    public string PrettyUtf8Xml =
@"<?xml version=""1.0"" encoding=""UTF-8""?>
<Person>
  <FirstName>John</FirstName>
  <MiddleName>Al Leon</MiddleName>
  <LastName>Doe</LastName>
</Person>";

    // Expected pretty output when IndentCharacters is a tab. The literal is written with
    // two-space indents and converted, so the tabs cannot be lost by an editor.
    public string PrettyUtf8XmlWithTabs =
@"<?xml version=""1.0"" encoding=""UTF-8""?>
<Person>
  <FirstName>John</FirstName>
  <MiddleName>Al Leon</MiddleName>
  <LastName>Doe</LastName>
</Person>".Replace("  <", "\t<");

    // Input with line breaks inside the declaration and inside element values.
    public string UglyUtf8Xml =
@"<?xml version=""1.0""
encoding=""UTF-8""?>
<Person>
<FirstName>
John
</FirstName>
<MiddleName>
Al
Leon
</MiddleName>
<LastName>
Doe
</LastName>
</Person>";

    // Expected linearized output with a UTF-16 declaration.
    public string LinearUtf16Xml = "<?xml version=\"1.0\" encoding=\"UTF-16\"?><Person><FirstName>John</FirstName><MiddleName>Al Leon</MiddleName><LastName>Doe</LastName></Person>";

    // Expected pretty output with a UTF-16 declaration (two-space indent).
    public string PrettyUtf16Xml =
@"<?xml version=""1.0"" encoding=""UTF-16""?>
<Person>
  <FirstName>John</FirstName>
  <MiddleName>Al Leon</MiddleName>
  <LastName>Doe</LastName>
</Person>";

    // Same ugly input as above, but declaring UTF-16.
    public string UglyUtf16Xml =
@"<?xml version=""1.0""
encoding=""UTF-16""?>
<Person>
<FirstName>
John
</FirstName>
<MiddleName>
Al
Leon
</MiddleName>
<LastName>
Doe
</LastName>
</Person>";

    /// <summary>
    /// Reads the stream to its end, disposes it, and returns the bytes that were read.
    /// </summary>
    private static byte[] ReadAllBytes(Stream stream)
    {
        using (stream)
        using (var memoryStream = new MemoryStream())
        {
            stream.CopyTo(memoryStream);
            return memoryStream.ToArray();
        }
    }

    [TestMethod]
    public void TestMethodLinearize()
    {
        // Arrange
        Xml xml = new Xml(PrettyUtf8Xml);

        // Act
        var actual = xml.LinearizeXml;

        // Assert
        Assert.AreEqual(LinearUtf8Xml, actual);
    }

    [TestMethod]
    public void TestMethodPretty()
    {
        // Arrange
        Xml xml = new Xml(LinearUtf8Xml);

        // Act
        var actual = xml.PrettyXml;

        // Assert
        Assert.AreEqual(PrettyUtf8Xml, actual);
    }

    [TestMethod]
    public void TestMethodLinearizeUgly()
    {
        // Arrange
        Xml xml = new Xml(UglyUtf8Xml);

        // Act
        var actual = xml.LinearizeXml;

        // Assert
        Assert.AreEqual(LinearUtf8Xml, actual);
    }

    [TestMethod]
    public void TestMethodMakeUglyPretty()
    {
        // Arrange
        Xml xml = new Xml(UglyUtf8Xml);

        // Act
        var actual = xml.PrettyXml;

        // Assert
        Assert.AreEqual(PrettyUtf8Xml, actual);
    }

    [TestMethod]
    public void TestMethodLinearizeUglyUtf16()
    {
        // Arrange
        Xml xml = new Xml(UglyUtf16Xml)
        {
            Encoding = Xml.XmlEncoding.UTF16
        };

        // Act
        var actual = xml.LinearizeXml;

        // Assert
        Assert.AreEqual(LinearUtf16Xml, actual);
    }

    [TestMethod]
    public void TestMethodMakeUglyPrettyUtf16()
    {
        // Arrange
        Xml xml = new Xml(UglyUtf16Xml)
        {
            Encoding = Xml.XmlEncoding.UTF16
        };

        // Act
        var actual = xml.PrettyXml;

        // Assert
        Assert.AreEqual(PrettyUtf16Xml, actual);
    }

    [TestMethod]
    public void TestMethodStreamIsUtf8()
    {
        // Arrange
        Xml xml = new Xml(UglyUtf8Xml)
        {
            Encoding = Xml.XmlEncoding.UTF8
        };

        // Act
        var bytes = ReadAllBytes(xml.ToStream());

        // Assert
        // 154 is the character count of the pretty Xml with two-space indents and
        // two-character (CR LF) line breaks; every character is one byte in UTF-8.
        Assert.AreEqual(154, bytes.Length);
    }

    [TestMethod]
    public void TestMethodStreamIsUtf16()
    {
        // Arrange
        Xml xml = new Xml(UglyUtf16Xml)
        {
            Encoding = Xml.XmlEncoding.UTF16
        };

        // Act
        var bytes = ReadAllBytes(xml.ToStream());

        // Assert
        // Twice the UTF-8 size of 154 is 308, but add 2 bytes because
        // "UTF-8" is 5 characters and "UTF-16" is 6 characters, so the
        // Xml declaration is one character longer.
        Assert.AreEqual(310, bytes.Length);
    }

    [TestMethod]
    public void TestMethodPrettyWuthTabs()
    {
        // Arrange
        Xml xml = new Xml(LinearUtf8Xml)
        {
            IndentCharacters = "\t"
        };

        // Act
        var actual = xml.PrettyXml;

        // Assert
        Assert.AreEqual(PrettyUtf8XmlWithTabs, actual);
    }

    [TestMethod]
    public void TestMethodStreamUtf8IsDifferentThanStreamUtf16()
    {
        // Arrange
        Xml utf8Xml = new Xml(UglyUtf8Xml)
        {
            Encoding = Xml.XmlEncoding.UTF8
        };
        Xml utf16Xml = new Xml(UglyUtf8Xml)
        {
            Encoding = Xml.XmlEncoding.UTF16
        };

        // Act
        var utf8 = ReadAllBytes(utf8Xml.ToStream());
        var utf16 = ReadAllBytes(utf16Xml.ToStream());

        // Assert
        Assert.AreNotEqual(utf8.Length, utf16.Length);
    }
}
}


OUTSTANDING! Thank you!
Works like a charm! Thanks for sharing dude
I have many problems changing the language at the root of a FreeBSD system and in xorg. I also have problems configuring the correct keyboard language in xorg on FreeBSD. How do I proceed to change the encoding to German?
!Greetings!