Friday, September 24, 2010

Convert HTML table to DataSet while downloading data from another site

Convert HTML table to DataSet while downloading data from another site


Imagine you are downloading a page from another site using:

// Download the page markup as a string. WebClient implements IDisposable,
// so it is released as soon as the download has finished.
string st;

using (System.Net.WebClient client = new System.Net.WebClient())
{
    st = client.DownloadString(strURL);
}

Whether you downloaded the markup this way or obtained it directly, if it contains an HTML table and you want to load that table into a DataSet for further processing, the following function does that using regular expressions.


/// <summary>
/// Scans an HTML string for &lt;table&gt; elements and converts each one into a
/// DataTable, returning all of them in a single DataSet in document order.
/// </summary>
/// <remarks>
/// Column names come from the table's &lt;th&gt; cells when it has any; otherwise they
/// are "Column 1", "Column 2", ... based on the number of &lt;td&gt; cells in the first row.
/// When &lt;th&gt; cells exist, the first &lt;tr&gt; is assumed to be the header row and is
/// not emitted as data. Cell values are the raw inner markup of each &lt;td&gt;; nothing
/// is HTML-decoded or stripped.
///
/// Matching is regex-based and non-greedy, so a table nested inside another table
/// ends the outer match at the first closing tag; nested tables are not supported.
/// </remarks>
/// <param name="HTML">Markup to scan. Null or empty input yields an empty DataSet.</param>
/// <returns>A DataSet with one DataTable per matched table; never null.</returns>
private DataSet ConvertHTMLTablesToDataSet(String HTML)
{
    DataSet ds = new DataSet();

    if (String.IsNullOrEmpty(HTML))
    {
        return ds;
    }

    // Singleline lets '.' cross line breaks; without it any table, row or cell
    // whose markup spans more than one line is silently skipped.
    const RegexOptions Options = RegexOptions.IgnoreCase | RegexOptions.Singleline;

    // \b keeps <th from matching <thead> (and similar prefix collisions).
    const string TableExpression = @"<table\b[^>]*>(.*?)</table>";
    const string HeaderExpression = @"<th\b[^>]*>(.*?)</th>";
    const string RowExpression = @"<tr\b[^>]*>(.*?)</tr>";
    const string ColumnExpression = @"<td\b[^>]*>(.*?)</td>";

    foreach (Match Table in Regex.Matches(HTML, TableExpression, Options))
    {
        DataTable dt = new DataTable();

        MatchCollection Rows = Regex.Matches(Table.Value, RowExpression, Options);
        MatchCollection Headers = Regex.Matches(Table.Value, HeaderExpression, Options);

        // Decide on headers with the same case-insensitive pattern that extracts them,
        // so <TH> is recognised and <thead> alone is not mistaken for a header cell.
        bool HeadersExist = Headers.Count > 0;

        if (HeadersExist)
        {
            foreach (Match Header in Headers)
            {
                AddColumnWithUniqueName(dt, Header.Groups[1].Value);
            }
        }
        else if (Rows.Count > 0)
        {
            // Size the default columns from the first row; computed once, not per iteration.
            int firstRowCellCount = Regex.Matches(Rows[0].Value, ColumnExpression, Options).Count;

            for (int iColumns = 1; iColumns <= firstRowCellCount; iColumns++)
            {
                dt.Columns.Add("Column " + iColumns);
            }
        }

        // When headers exist the first <tr> is treated as the header row and skipped.
        int firstDataRow = HeadersExist ? 1 : 0;

        for (int iCurrentRow = firstDataRow; iCurrentRow < Rows.Count; iCurrentRow++)
        {
            MatchCollection Cells = Regex.Matches(Rows[iCurrentRow].Value, ColumnExpression, Options);

            // A ragged row may carry more cells than the table has columns; grow the
            // table before creating the row instead of failing on an out-of-range index.
            while (dt.Columns.Count < Cells.Count)
            {
                AddColumnWithUniqueName(dt, "Column " + (dt.Columns.Count + 1));
            }

            DataRow dr = dt.NewRow();

            for (int iCurrentColumn = 0; iCurrentColumn < Cells.Count; iCurrentColumn++)
            {
                dr[iCurrentColumn] = Cells[iCurrentColumn].Groups[1].Value;
            }

            dt.Rows.Add(dr);
        }

        ds.Tables.Add(dt);
    }

    return ds;
}

/// <summary>
/// Adds a string column to <paramref name="table"/>, appending " (2)", " (3)", ... to
/// <paramref name="preferredName"/> when a column with that name (compared
/// case-insensitively) already exists, so repeated header text cannot raise
/// DuplicateNameException.
/// </summary>
private static void AddColumnWithUniqueName(DataTable table, string preferredName)
{
    string name = preferredName;
    int suffix = 2;

    // An empty name is left to the DataTable, which assigns its own default name.
    while (name.Length > 0 && table.Columns.Contains(name))
    {
        name = preferredName + " (" + suffix + ")";
        suffix++;
    }

    table.Columns.Add(name);
}

3 comments:

  1. Great example. Thanks. Was able to lift it with no changes :)

    ReplyDelete
  2. This comment has been removed by the author.

    ReplyDelete
  3. Excellent. Working perfect...

    ReplyDelete