Showing posts with label HTML table to DataSet. Show all posts
Showing posts with label HTML table to DataSet. Show all posts

Friday, September 24, 2010

Convert HTML table to DataSet while downloading data from another site

Convert HTML table to DataSet while downloading data from another site


Imagine that you are downloading data from another site using

System.Net.WebClient client = new System.Net.WebClient();

  string st = client.DownloadString(strURL);

or that you already have the HTML directly. If it contains an HTML table and you want to load that table into a DataSet for further processing, this function will help: it extracts the tables using regular expressions.


/// <summary>
/// Extracts every &lt;table&gt; element found in an HTML string and converts each
/// one into a <see cref="DataTable"/> inside the returned <see cref="DataSet"/>.
/// </summary>
/// <remarks>
/// Parsing is done with regular expressions, so it is only suitable for simple,
/// well-formed markup; nested tables are not supported because the table pattern
/// stops at the first closing &lt;/table&gt; tag.
/// Column names come from the table's &lt;th&gt; cells when present; otherwise
/// columns are named "Column 1", "Column 2", ... after the cells of the first row.
/// When headers exist, the first &lt;tr&gt; is treated as the header row and skipped.
/// Cell values are the raw inner HTML of each &lt;td&gt; (not trimmed or decoded).
/// </remarks>
/// <param name="HTML">The HTML text to scan. May be null or empty.</param>
/// <returns>
/// A <see cref="DataSet"/> with one table per HTML table found, in document
/// order. It is empty when <paramref name="HTML"/> is null, empty, or contains
/// no tables.
/// </returns>
private DataSet ConvertHTMLTablesToDataSet(String HTML)
{
    DataSet ds = new DataSet();

    // Nothing to parse: return the empty DataSet instead of relying on a
    // swallowed exception.
    if (string.IsNullOrEmpty(HTML))
    {
        return ds;
    }

    // Singleline lets '.' span line breaks, so tables, rows and cells that are
    // spread over several lines are still matched.
    const RegexOptions Options = RegexOptions.IgnoreCase | RegexOptions.Singleline;

    // The \b after each tag name prevents <th matching <thead>, <tr matching
    // <track>, and so on.
    const string TableExpression = @"<table\b[^>]*>(.*?)</table\s*>";
    const string HeaderExpression = @"<th\b[^>]*>(.*?)</th\s*>";
    const string RowExpression = @"<tr\b[^>]*>(.*?)</tr\s*>";
    const string ColumnExpression = @"<td\b[^>]*>(.*?)</td\s*>";

    foreach (Match tableMatch in Regex.Matches(HTML, TableExpression, Options))
    {
        DataTable dt = new DataTable();
        string tableHtml = tableMatch.Value;

        MatchCollection rows = Regex.Matches(tableHtml, RowExpression, Options);
        MatchCollection headers = Regex.Matches(tableHtml, HeaderExpression, Options);

        // Headers exist only if at least one real <th> cell was found.
        bool headersExist = headers.Count > 0;

        if (headersExist)
        {
            foreach (Match header in headers)
            {
                AddUniqueColumn(dt, header.Groups[1].Value);
            }
        }
        else if (rows.Count > 0)
        {
            // No headers: size the table from the first row, computed once.
            int firstRowCells = Regex.Matches(rows[0].Value, ColumnExpression, Options).Count;
            for (int number = 1; number <= firstRowCells; number++)
            {
                AddUniqueColumn(dt, "Column " + number);
            }
        }

        for (int rowIndex = 0; rowIndex < rows.Count; rowIndex++)
        {
            // The first row of a table with headers is the header row itself.
            if (headersExist && rowIndex == 0)
            {
                continue;
            }

            MatchCollection cells = Regex.Matches(rows[rowIndex].Value, ColumnExpression, Options);

            // A ragged row with more cells than columns widens the table
            // instead of throwing and discarding all remaining data.
            while (dt.Columns.Count < cells.Count)
            {
                AddUniqueColumn(dt, "Column " + (dt.Columns.Count + 1));
            }

            DataRow dr = dt.NewRow();
            for (int cellIndex = 0; cellIndex < cells.Count; cellIndex++)
            {
                dr[cellIndex] = cells[cellIndex].Groups[1].Value;
            }

            dt.Rows.Add(dr);
        }

        ds.Tables.Add(dt);
    }

    return ds;
}

/// <summary>
/// Adds a string column to <paramref name="table"/>, appending " (2)", " (3)", ...
/// to <paramref name="baseName"/> when a column with that name already exists.
/// An empty name is passed through so the DataTable assigns its default name.
/// </summary>
private static void AddUniqueColumn(DataTable table, string baseName)
{
    string candidate = baseName;
    int suffix = 2;

    while (candidate.Length > 0 && table.Columns.Contains(candidate))
    {
        candidate = baseName + " (" + suffix + ")";
        suffix++;
    }

    table.Columns.Add(candidate);
}