Getting a substring of text that contains HTML tags is trickier than you might think. Assume that you want the first 10 characters of the following:
"<p>this is paragraph 1</p><p>this is paragraph 2</p>"
The output would be:
"<p>this is"
As you can see, the returned text contains an unclosed P tag. If this is rendered to a page, subsequent content will be affected by the open P tag. Ideally, the output would close any unclosed HTML tags in the reverse of the order in which they were opened:
"<p>this is</p>"
Here is a function that returns a substring of HTML, making sure that no tags are left unclosed:
using System;
using System.Collections.Generic;

/// <summary>
/// String helpers for working with HTML fragments.
/// </summary>
public static class StringUtility
{
    // HTML void elements. They never take a closing tag, so they must never be
    // recorded as "unclosed" (otherwise truncation would emit e.g. "</img>").
    private static readonly string[] VoidElements =
    {
        "area", "base", "br", "col", "embed", "hr", "img", "input",
        "link", "meta", "param", "source", "track", "wbr"
    };

    // Characters that end the element name inside an opening tag.
    private static readonly char[] TagNameTerminators = { ' ', '\t', '\r', '\n', '\f' };

    /// <summary>
    /// Returns the leading part of <paramref name="html"/> that contains at most
    /// <paramref name="length"/> characters of text, not counting markup.
    /// When the input has to be cut, "..." is appended, followed by closing tags
    /// for every element still open at the cut point, innermost first.
    /// </summary>
    /// <param name="html">The HTML fragment to shorten.</param>
    /// <param name="length">Maximum number of non-markup characters to keep.</param>
    /// <returns>
    /// <paramref name="html"/> unchanged when it is not longer than
    /// <paramref name="length"/> or when its text fits within the limit;
    /// otherwise the shortened fragment with "..." and the closing tags appended.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="html"/> is null.</exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="length"/> is negative.</exception>
    /// <remarks>
    /// A tag is considered to end at the first '&gt;' after its '&lt;'. A '&gt;' inside a
    /// quoted attribute value or inside a comment therefore ends the tag early.
    /// </remarks>
    public static string HTMLSubstring( string html, int length )
    {
        if ( html == null )
        {
            throw new ArgumentNullException( "html" );
        }

        if ( length < 0 )
        {
            throw new ArgumentOutOfRangeException( "length", "length must not be negative." );
        }

        // Fast path: even counting markup, the input is within the limit.
        if ( html.Length <= length )
        {
            return html;
        }

        List<string> unclosedTags = new List<string>();

        // Index in html at which the text budget is used up. It starts at the requested
        // length and is pushed forward by the size of every tag that is skipped, so that
        // markup never counts against the budget.
        int limit = length;

        for ( int i = 0; i < html.Length; i++ )
        {
            int tagEnd = FindTagEnd( html, i );

            if ( tagEnd >= 0 )
            {
                // Markup: record its effect on the open-element stack and jump over it
                // as a unit so a tag is never cut in half.
                TrackTag( html, i, tagEnd, unclosedTags );
                limit += tagEnd + 1 - i;
                i = tagEnd;
                continue;
            }

            if ( i >= limit )
            {
                // A text character beyond the budget: cut here and return immediately.
                // (Continuing the loop over the rewritten string would never terminate.)
                string result = html.Substring( 0, i ) + "...";

                // Close innermost elements first.
                for ( int k = unclosedTags.Count - 1; k >= 0; k-- )
                {
                    result += "</" + unclosedTags[k] + ">";
                }

                return result;
            }
        }

        // All text fitted within the budget; nothing to cut.
        return html;
    }

    // Returns the index of the '>' that ends the tag starting at 'start',
    // or -1 when the character at 'start' does not begin a tag.
    private static int FindTagEnd( string html, int start )
    {
        if ( html[start] != '<' || start + 1 >= html.Length )
        {
            return -1;
        }

        // Only a letter (element), '/' (closing tag), '!' (comment/doctype) or
        // '?' (processing instruction) may follow '<'; anything else, such as
        // "a < b" or "x<5", is ordinary text.
        char next = html[start + 1];

        if ( !char.IsLetter( next ) && next != '/' && next != '!' && next != '?' )
        {
            return -1;
        }

        return html.IndexOf( '>', start + 1 );
    }

    // Updates the stack of open elements for the tag spanning html[tagStart..tagEnd].
    private static void TrackTag( string html, int tagStart, int tagEnd, List<string> unclosedTags )
    {
        char first = html[tagStart + 1];

        if ( first == '!' || first == '?' )
        {
            // Comments, doctypes and processing instructions open nothing.
            return;
        }

        if ( first == '/' )
        {
            // Closing tag: remove the most recently opened element with the same name
            // (element names are case-insensitive). An unmatched closing tag is ignored.
            string closingName = html.Substring( tagStart + 2, tagEnd - tagStart - 2 ).Trim();

            for ( int k = unclosedTags.Count - 1; k >= 0; k-- )
            {
                if ( string.Equals( unclosedTags[k], closingName, StringComparison.OrdinalIgnoreCase ) )
                {
                    unclosedTags.RemoveAt( k );
                    break;
                }
            }

            return;
        }

        if ( html[tagEnd - 1] == '/' )
        {
            // Self-closing tag such as <br/>.
            return;
        }

        // Opening tag: the element name runs up to the first whitespace (attributes follow).
        string inner = html.Substring( tagStart + 1, tagEnd - tagStart - 1 );
        int nameEnd = inner.IndexOfAny( TagNameTerminators );
        string tagName = nameEnd >= 0 ? inner.Substring( 0, nameEnd ) : inner;

        if ( !IsVoidElement( tagName ) )
        {
            unclosedTags.Add( tagName );
        }
    }

    // True when 'tagName' names an HTML void element (compared case-insensitively).
    private static bool IsVoidElement( string tagName )
    {
        foreach ( string voidElement in VoidElements )
        {
            if ( string.Equals( voidElement, tagName, StringComparison.OrdinalIgnoreCase ) )
            {
                return true;
            }
        }

        return false;
    }
}
// Source by : - http://andrewgunn.blogspot.com/2008/06/html-substring-in-cnet.html
No comments:
Post a Comment