Introduction
An interesting problem is parsing a markup document so that it can be represented as an object. This is very helpful if, for example, you want to generate valid markup code (e.g. placing quotes around HTML attribute values and adding the end tags). It is also helpful when you need to replace a section of an element or traverse the document: whatever logic is needed is much easier to write and use against an object model.
Here I present a project I made that takes any markup and turns it into a MarkupDocument with elements, attributes, content, etc. Below is a screenshot of some markup represented in a tree format; I used the MarkupDocument to build the tree.
Using the code
Here is the class Diagram for the Markup Representation
The MarkupDocument class contains the whole document, consisting of ChildElements and Content, and a method for parsing a given markup string into the object representation.
The ToString method is overridden in all classes to provide the markup of the represented element in text format.
Below is the method I made for parsing the markup. It is a static method that takes a MarkupDocument and a markup string, and loads the parsed string into the document.
Here I use regular expressions for parsing the document, e.g. for retrieving the element name, the attributes, etc.
/// <summary>
/// Parses <paramref name="markup"/> and loads the resulting elements into
/// <paramref name="document"/>. The document's existing child elements are
/// cleared first.
/// </summary>
/// <param name="document">The document that receives the parsed elements.</param>
/// <param name="markup">The markup text to parse.</param>
/// <exception cref="System.ArgumentNullException">
/// <paramref name="document"/> or <paramref name="markup"/> is null.
/// </exception>
/// <remarks>
/// The markup is split on '&lt;', so every chunk is expected to start with a tag.
/// Chunks that do not start with a word character or '/' (comments, DOCTYPE
/// declarations) are skipped. Text that precedes the first '&lt;' is not
/// distinguished from a tag; this limitation is unchanged.
/// Attribute values are stored exactly as written, including any surrounding quotes.
/// </remarks>
public static void ParseString(MarkupDocument document, string markup)
{
    if (document == null)
        throw new System.ArgumentNullException("document");
    if (markup == null)
        throw new System.ArgumentNullException("markup");

    document.ChildElements.Clear();

    // Built once per call; constructing compiled regexes for every tag is needlessly expensive.
    // Extracts the element name (with optional leading whitespace) from the start of a tag.
    Regex nameRegex = new Regex("^\\s*\\w*", RegexOptions.IgnoreCase | RegexOptions.Compiled);
    // Extracts one attribute at a time: a name, '=', then a double-quoted, single-quoted
    // or unquoted value. The value group keeps its quotes.
    Regex attributeRegex = new Regex(
        "(?<name>[^\\s=<>/\"']+)\\s*=\\s*(?<value>\"[^\"]*\"|'[^']*'|[^\\s>]+)",
        RegexOptions.Compiled);

    // The element that is currently open; null while at the document root.
    MarkupElement parentElement = null;

    foreach (string str in markup.Split('<'))
    {
        if (str.Trim().Length == 0)
            continue;

        string workingMarkup = str;

        // ----- Closing tag -----
        // str.Trim() is non-empty here, so TrimStart() has at least one character.
        if (workingMarkup.TrimStart()[0] == '/')
        {
            // NOTE(review): IsSpecial presumably reports whether the tag names one of
            // document.SpecialElements - confirm against its implementation.
            bool isSpecial = document.IsSpecial(workingMarkup, ">", 1);

            // A closing tag of a special element closes nothing; any other closing tag
            // navigates up one level (when there is a level to leave).
            if (parentElement != null && !isSpecial)
                parentElement = parentElement.ParentElement;

            // A stray closing tag at the root has no element to close, so it only
            // contributes content to the document itself.
            InsertContentFor(document, parentElement, workingMarkup, ">", 1);
            continue;
        }

        // ----- Element name -----
        Match nameMatch = nameRegex.Match(workingMarkup);
        // Trimmed so that the SpecialElements lookup below is not defeated by whitespace.
        string elementName = nameMatch.Value.Trim();
        if (!nameMatch.Success || elementName.Length == 0)
            continue;

        MarkupElement currentElement = new MarkupElement();
        currentElement.Document = document;
        currentElement.ParentElement = parentElement;
        currentElement.Name = elementName;

        // Strip only the leading name. Replacing every occurrence of the name would also
        // eat matching text inside attribute values and content.
        workingMarkup = workingMarkup.Substring(nameMatch.Index + nameMatch.Length);

        // ----- Attributes -----
        // Attributes live between the name and the end of the tag; text after the tag
        // must not be scanned, or content such as "x = y" would be read as an attribute.
        int tagEnd = FindTagEnd(workingMarkup);
        bool isSelfClosed = tagEnd > 0 && workingMarkup[tagEnd - 1] == '/';
        string attributeText;
        if (tagEnd < 0)
            attributeText = workingMarkup;
        else
            attributeText = workingMarkup.Substring(0, isSelfClosed ? tagEnd - 1 : tagEnd);

        for (Match attributeMatch = attributeRegex.Match(attributeText);
             attributeMatch.Success;
             attributeMatch = attributeMatch.NextMatch())
        {
            MarkupAttribute attribute = new MarkupAttribute();
            attribute.Name = attributeMatch.Groups["name"].Value;
            attribute.Value = attributeMatch.Groups["value"].Value;
            currentElement.Attributes.Add(attribute);
        }

        // ----- Add element to its owner -----
        if (parentElement == null)
            document.ChildElements.Add(currentElement);
        else
            parentElement.ChildElements.Add(currentElement);

        // ----- Content -----
        if (isSelfClosed)
        {
            currentElement.IsSelfClosed = true;
            InsertContentFor(document, parentElement, workingMarkup, "/>", 2);
        }
        else if (!document.SpecialElements.Contains(currentElement.Name))
        {
            // A regular element stays open: following chunks become its children.
            parentElement = currentElement;
            document.InsertContent(currentElement, workingMarkup, ">", 1);
        }
        else
        {
            // A special element never becomes a parent; the text after it belongs
            // to whatever is currently open.
            InsertContentFor(document, parentElement, workingMarkup, ">", 1);
        }
    }
}

/// <summary>
/// Forwards to the element overload of InsertContent when <paramref name="owner"/> is
/// set, and to the document-level overload when it is null (i.e. at the document root).
/// </summary>
/// <remarks>
/// NOTE(review): the last two arguments are presumably the delimiter that ends the tag
/// and its length - confirm against MarkupDocument.InsertContent.
/// </remarks>
private static void InsertContentFor(MarkupDocument document, MarkupElement owner,
    string tagMarkup, string delimiter, int delimiterLength)
{
    if (owner != null)
        document.InsertContent(owner, tagMarkup, delimiter, delimiterLength);
    else
        document.InsertContent(tagMarkup, delimiter, delimiterLength);
}

/// <summary>
/// Returns the index of the first '&gt;' in <paramref name="tagMarkup"/> that is not inside
/// a quoted attribute value, or -1 when the text contains no '&gt;' at all.
/// </summary>
private static int FindTagEnd(string tagMarkup)
{
    char openQuote = '\0';
    for (int i = 0; i < tagMarkup.Length; i++)
    {
        char c = tagMarkup[i];
        if (openQuote != '\0')
        {
            if (c == openQuote)
                openQuote = '\0';
        }
        else if (c == '"' || c == '\'')
        {
            openQuote = c;
        }
        else if (c == '>')
        {
            return i;
        }
    }

    // An unbalanced quote (e.g. an apostrophe in an unquoted value) hides the real end;
    // fall back to the first '>' of any kind.
    return tagMarkup.IndexOf('>');
}
OK, now let's assume you know that there are some special markup elements that are meant to be single elements. All you have to do is add them to the SpecialElements list of the document object:
MarkupLibrary.MarkupDocument document = new MarkupLibrary.MarkupDocument();
// Register each element name that is meant to be a single element.
foreach (string specialName in new string[] { "br", "hr", "img" })
{
    document.SpecialElements.Add(specialName);
}
To load the document with the markup string, call the Load method:
// Load the markup string into the document.
document.Load(markup);
Points of Interest
Finally, I hope that this will help you create your own object model from a markup representation.