Abot is an open-source .NET crawler framework that is fast, easy to use, and extensible. The project lives at https://code.google.com/p/abot/.
To parse the crawled HTML we use AngleSharp, hosted at https://github.com/AngleSharp/AngleSharp.
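Both libraries are available on NuGet. The snippets below assume roughly the following namespaces; exact names vary a little between Abot/AngleSharp versions, so treat this as a sketch:
// Install-Package Abot (recent Abot 1.x versions pull AngleSharp in as a dependency;
// otherwise also Install-Package AngleSharp)
using System;
using System.Text;
using Abot.Crawler;   // PoliteWebCrawler, IWebCrawler, crawl events
using Abot.Poco;      // CrawlConfiguration, CrawlDecision, PageToCrawl, CrawledPage, ...
using AngleSharp.Dom; // Text() extension (AngleSharp.Extensions in some older versions)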
First, we need to configure Abot:
private static readonly Uri FeedUrl = new Uri("https://www.jd.com/allSort.aspx"); // the URL to crawl; here, JD.com's product category page

public static IWebCrawler GetManuallyConfiguredWebCrawler()
{
    // Configure the crawler; see the Abot source for the full list of options
    CrawlConfiguration config = new CrawlConfiguration();
    config.MaxConcurrentThreads = System.Environment.ProcessorCount; // one crawl thread per CPU core
    config.MaxPagesToCrawl = 1000;                     // stop after 1000 pages
    config.IsExternalPageCrawlingEnabled = false;      // do not crawl pages on other domains
    config.IsUriRecrawlingEnabled = false;             // do not revisit URIs that were already crawled
    config.IsExternalPageLinksCrawlingEnabled = false; // do not follow links found on external pages
    config.IsRespectRobotsDotTextEnabled = false;      // ignore robots.txt
    config.DownloadableContentTypes = "text/html, text/plain";
    config.MinCrawlDelayPerDomainMilliSeconds = 1000;  // wait at least 1s between requests to the same domain
    config.CrawlTimeoutSeconds = 0;                    // 0 = no overall crawl timeout
    config.MaxPagesToCrawlPerDomain = 0;               // 0 = no per-domain page limit
    // Passing null for the remaining constructor arguments makes Abot use its default implementations
    var crawler = new PoliteWebCrawler(config, null, null, null, null, null, null, null, null);
    // Decision delegates evaluated before a page is processed
    crawler.ShouldCrawlPage(ShouldCrawlPage);
    crawler.ShouldDownloadPageContent(ShouldDownloadPageContent);
    crawler.ShouldCrawlPageLinks(ShouldCrawlPageLinks);
    // The four crawl events
    crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;        // a single page crawl is starting
    crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompletedAsync; // a single page crawl has completed
    //crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;          // a page was not allowed to be crawled
    //crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;// the links on a page were not allowed to be crawled
    return crawler;
}
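As an alternative to configuring everything in code, Abot can load its settings from an <abot> section in app.config/web.config. A minimal sketch, assuming the config section from Abot's README has been added to your config file:
public static IWebCrawler GetConfigSectionWebCrawler()
{
    // AbotConfigurationSectionHandler reads the <abot> section of app.config/web.config
    CrawlConfiguration config = AbotConfigurationSectionHandler.LoadFromXml().Convert();
    return new PoliteWebCrawler(config, null, null, null, null, null, null, null, null);
}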
The crawler raises four main events: page crawl starting, page crawl completed, page crawl disallowed, and page links crawl disallowed.
Here are handlers for the first two; sketches of the two disallowed handlers follow the code below.
// A single page crawl is starting
public static void crawler_ProcessPageCrawlStarting(object sender, PageCrawlStartingArgs e)
{
    PageToCrawl pageToCrawl = e.PageToCrawl;
}

// A single page crawl has completed
public static void crawler_ProcessPageCrawlCompletedAsync(object sender, PageCrawlCompletedArgs e)
{
    if (e.CrawledPage.Uri == FeedUrl)
    {
        StringBuilder sb = new StringBuilder();
        // Parse the HTML with AngleSharp; .category-items wraps JD's category columns
        var all = e.CrawledPage.AngleSharpHtmlDocument.QuerySelector(".category-items").Children;
        foreach (var col in all)
        {
            var categories = col.QuerySelectorAll(".category-item");
            foreach (var category in categories)
            {
                // First-level category title
                var first = category.QuerySelector(".item-title span").Text();
                sb.Append("\r\n" + first + "\r\n");
                var seconds = category.QuerySelector(".items").Children;
                foreach (var second in seconds)
                {
                    // Second-level category
                    var secondText = second.QuerySelector("dt a").Text();
                    sb.Append(secondText + "\t");
                    // Third-level categories, comma separated
                    var thirds = second.QuerySelector("dd").Children;
                    foreach (var third in thirds)
                    {
                        sb.Append(third.Text() + ",");
                    }
                    sb.Remove(sb.Length - 1, 1); // drop the trailing comma
                }
            }
        }
        // The file is written relative to the working directory; under IIS Express that is
        // C:\Program Files (x86)\IIS Express, so you may need to run VS as administrator
        System.IO.File.AppendAllText("fake.txt", sb.ToString());
    }
}
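The two events that are commented out in GetManuallyConfiguredWebCrawler fire when a page, or its links, is rejected by a decision delegate or a configured limit. Minimal handler sketches, useful for logging why pages were skipped (the event argument members follow Abot's samples):
// A page was not allowed to be crawled
public static void crawler_PageCrawlDisallowed(object sender, PageCrawlDisallowedArgs e)
{
    Console.WriteLine("Did not crawl page {0} due to {1}", e.PageToCrawl.Uri.AbsoluteUri, e.DisallowedReason);
}

// The links on a crawled page were not allowed to be crawled
public static void crawler_PageLinksCrawlDisallowed(object sender, PageLinksCrawlDisallowedArgs e)
{
    Console.WriteLine("Did not crawl the links on page {0} due to {1}", e.CrawledPage.Uri.AbsoluteUri, e.DisallowedReason);
}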
#region Crawl decision delegates
/// <summary>
/// Synchronously registered delegate that decides whether a page should be crawled
/// </summary>
/// <param name="pageToCrawl"></param>
/// <param name="crawlContext"></param>
/// <returns></returns>
public static CrawlDecision ShouldCrawlPage(PageToCrawl pageToCrawl, CrawlContext crawlContext)
{
    // Allow retries, the root URL, and the feed URL we specified
    if (pageToCrawl.IsRetry || pageToCrawl.IsRoot || FeedUrl == pageToCrawl.Uri)
    {
        return new CrawlDecision() { Allow = true };
    }
    else
    {
        // Allow = false means this page is skipped
        return new CrawlDecision { Allow = false, Reason = "Uri does not match" };
    }
}
/// <summary>
/// Synchronously registered delegate that decides whether a page's content should be downloaded
/// </summary>
/// <param name="pageToCrawl"></param>
/// <param name="crawlContext"></param>
/// <returns></returns>
private static CrawlDecision ShouldDownloadPageContent(PageToCrawl pageToCrawl, CrawlContext crawlContext)
{
    if (pageToCrawl.IsRoot || pageToCrawl.IsRetry || FeedUrl == pageToCrawl.Uri)
    {
        return new CrawlDecision
        {
            Allow = true
        };
    }
    return new CrawlDecision { Allow = false, Reason = "Uri does not match" };
}
/// <summary>
/// Synchronously registered delegate that decides whether the links on a page should be crawled
/// </summary>
/// <param name="crawledPage"></param>
/// <param name="crawlContext"></param>
/// <returns></returns>
private static CrawlDecision ShouldCrawlPageLinks(CrawledPage crawledPage, CrawlContext crawlContext)
{
    if (!crawledPage.IsInternal)
        return new CrawlDecision { Allow = false, Reason = "We don't crawl links of external pages" };
    if (crawledPage.IsRoot || crawledPage.IsRetry || crawledPage.Uri == FeedUrl)
    {
        return new CrawlDecision { Allow = true };
    }
    else
    {
        return new CrawlDecision { Allow = false, Reason = "We only crawl links of pagination pages" };
    }
}
#endregion
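These delegates can encode any policy you like. For example, a hypothetical variant that also caps how far from the root the crawler descends (PageToCrawl.CrawlDepth is assumed here; it is present in current Abot versions):
public static CrawlDecision ShouldCrawlPageMaxDepth(PageToCrawl pageToCrawl, CrawlContext crawlContext)
{
    // Hypothetical policy: never follow links more than 2 hops from the root
    if (pageToCrawl.CrawlDepth > 2)
        return new CrawlDecision { Allow = false, Reason = "Crawl depth exceeded" };
    return ShouldCrawlPage(pageToCrawl, crawlContext); // fall back to the URL check above
}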
Finally, test it from an MVC controller action:
public ActionResult Index()
{
    var crawler = GetManuallyConfiguredWebCrawler();
    var result = crawler.Crawl(FeedUrl); // blocks until the crawl finishes
    Response.Write(result.ErrorException);
    return View();
}
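Crawl returns a CrawlResult that carries more than the exception; ErrorOccurred, RootUri, and Elapsed are assumed here from Abot's samples. A sketch that reports the outcome instead of blindly writing the exception:
public ActionResult IndexWithStats()
{
    var crawler = GetManuallyConfiguredWebCrawler();
    CrawlResult result = crawler.Crawl(FeedUrl);
    if (result.ErrorOccurred)
        Response.Write(string.Format("Crawl of {0} failed: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message));
    else
        Response.Write(string.Format("Crawl of {0} completed in {1}", result.RootUri.AbsoluteUri, result.Elapsed));
    return View();
}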