這兩天在做資料採集,所以整理了一下資料採集要用到的一些方法。因為我採集的資料比較簡單,所以沒有用到框架。比較有名的兩個框架是 HtmlAgilityPack 和 Jumony,感興趣的可以研究一下。當然,火車頭採集工具也很方便,不過要付費。下面是整理的程式碼:
/// <summary>
/// Regex-based helpers for extracting fragments from HTML text and for stripping tags.
/// </summary>
/// <remarks>
/// These helpers perform plain regular-expression matching on the input string;
/// they do not parse the HTML into a document tree.
/// </remarks>
public class HtmlRegex
{
    /// <summary>
    /// Matches any tag: a '&lt;', one or more characters other than '&gt;', then '&gt;'.
    /// Closing tags such as &lt;/p&gt; are covered by the same expression, so no
    /// separate alternative for them is needed.
    /// </summary>
    private static readonly Regex HtmlTagRegex = new Regex(@"<[^>]+>");

    /// <summary>
    /// Removes every HTML tag from the given text.
    /// </summary>
    /// <param name="content">Original HTML text.</param>
    /// <returns>
    /// <paramref name="content"/> with all tags removed; an empty string when
    /// <paramref name="content"/> is null or empty.
    /// </returns>
    public static string RemoveAllHtml(string content)
    {
        if (string.IsNullOrEmpty(content))
        {
            return string.Empty;
        }

        return HtmlTagRegex.Replace(content, string.Empty);
    }

    /// <summary>
    /// Returns the first substring of <paramref name="content"/> matched by a regular expression.
    /// </summary>
    /// <param name="regStr">Regular expression pattern.</param>
    /// <param name="content">Original HTML text.</param>
    /// <param name="hashtml">
    /// True to return the match as-is; false to strip HTML tags from the match first.
    /// </param>
    /// <returns>
    /// The first match, or an empty string when nothing matches or
    /// <paramref name="content"/> is null or empty.
    /// </returns>
    /// <exception cref="System.ArgumentException">
    /// <paramref name="regStr"/> is null or is not a valid regular expression.
    /// </exception>
    public static string GetStrByRegex(string regStr, string content, bool hashtml = true)
    {
        return MatchFirst(regStr, content, hashtml);
    }

    /// <summary>
    /// Returns the first, shortest span of <paramref name="content"/> that begins with
    /// <paramref name="start"/> and ends with <paramref name="end"/> (both delimiters included).
    /// </summary>
    /// <param name="start">
    /// Opening delimiter. It is inserted into the pattern unescaped, so it is interpreted
    /// as a regular-expression fragment; pass it through Regex.Escape for literal text
    /// that contains regex metacharacters.
    /// </param>
    /// <param name="end">Closing delimiter; interpreted the same way as <paramref name="start"/>.</param>
    /// <param name="content">Original HTML text.</param>
    /// <param name="hasHtml">
    /// True to return the match as-is; false to strip HTML tags from the match first.
    /// </param>
    /// <returns>
    /// The first match, or an empty string when nothing matches or
    /// <paramref name="content"/> is null or empty.
    /// </returns>
    /// <exception cref="System.ArgumentException">
    /// The pattern built from <paramref name="start"/> and <paramref name="end"/> is not a
    /// valid regular expression.
    /// </exception>
    public static string GetStrByRegex(string start, string end, string content, bool hasHtml = true)
    {
        return MatchFirst(BuildRangePattern(start, end), content, hasHtml);
    }

    /// <summary>
    /// Returns every substring of <paramref name="content"/> matched by a regular expression.
    /// </summary>
    /// <param name="regStr">Regular expression pattern.</param>
    /// <param name="content">Original HTML text.</param>
    /// <returns>
    /// The matched substrings in match order, or null when there is no match, when
    /// <paramref name="regStr"/> is not a valid pattern, or when either argument is null.
    /// Callers must check for null; an empty list is never returned.
    /// </returns>
    public static List<string> GetStrListByRegex(string regStr, string content)
    {
        return MatchAll(regStr, content);
    }

    /// <summary>
    /// Returns every shortest span of <paramref name="content"/> that begins with
    /// <paramref name="start"/> and ends with <paramref name="end"/> (both delimiters included).
    /// </summary>
    /// <param name="start">
    /// Opening delimiter. It is inserted into the pattern unescaped, so it is interpreted
    /// as a regular-expression fragment.
    /// </param>
    /// <param name="end">Closing delimiter; interpreted the same way as <paramref name="start"/>.</param>
    /// <param name="content">Original HTML text.</param>
    /// <returns>
    /// The matched spans in match order, or null when there is no match, when the built
    /// pattern is invalid, or when <paramref name="content"/> is null.
    /// Callers must check for null; an empty list is never returned.
    /// </returns>
    public static List<string> GetStrListByRegex(string start, string end, string content)
    {
        return MatchAll(BuildRangePattern(start, end), content);
    }

    /// <summary>
    /// Builds the "start ... end" pattern shared by the delimiter-based overloads.
    /// </summary>
    private static string BuildRangePattern(string start, string end)
    {
        // (?i) ignores case, (?s) lets '.' match newlines so a span may cross lines;
        // the lazy .*? stops at the first occurrence of the end delimiter.
        return @"(?is)(" + start + ").*?(" + end + ")";
    }

    /// <summary>
    /// Returns the first match of <paramref name="pattern"/> in <paramref name="content"/>,
    /// optionally with HTML tags stripped; an empty string when there is none.
    /// </summary>
    private static string MatchFirst(string pattern, string content, bool keepHtml)
    {
        // Build the regex before looking at the content so that an invalid pattern is
        // always reported, regardless of what content was supplied.
        Regex regex = new Regex(pattern);

        if (string.IsNullOrEmpty(content))
        {
            return string.Empty;
        }

        Match match = regex.Match(content);
        if (!match.Success)
        {
            return string.Empty;
        }

        return keepHtml ? match.Value : RemoveAllHtml(match.Value);
    }

    /// <summary>
    /// Returns all matches of <paramref name="pattern"/> in <paramref name="content"/>,
    /// or null when there are none or the arguments are unusable.
    /// </summary>
    private static List<string> MatchAll(string pattern, string content)
    {
        try
        {
            MatchCollection matches = new Regex(pattern).Matches(content);
            if (matches.Count == 0)
            {
                return null;
            }

            List<string> values = new List<string>(matches.Count);
            foreach (Match match in matches)
            {
                values.Add(match.Value);
            }

            return values;
        }
        catch (System.ArgumentException)
        {
            // Best-effort contract: an invalid pattern or a null argument yields null.
            // Only argument errors are swallowed; any other failure propagates.
            return null;
        }
    }
}