搞搞異步，採集一下部落格園新聞的第一頁。
/// <summary>
/// Requests the item list for https://news.cnblogs.com/ from
/// <c>Search.CrawlerCnblosNewsList</c>, starts one <c>CrawlerCnblosNews4</c> task per
/// item, waits for all of them, and prints the number of collected articles together
/// with the elapsed time in milliseconds.
/// </summary>
/// <remarks>
/// The method is <c>async void</c>: callers cannot await it, and an exception thrown
/// by any fetch cannot be caught by the caller. The signature is kept unchanged so
/// that existing call sites keep compiling.
/// </remarks>
public static async void DoRun()
{
    Stopwatch watch = new Stopwatch();
    watch.Start();

    // Shared by every article task; each task appends its own result.
    List<Article> articles = new List<Article>();

    // One client is shared by all requests of this run and released when the run ends.
    using (HttpClient httpClient = new HttpClient())
    {
        string url = "https://news.cnblogs.com/";
        Search search = new Search();
        HtmlDocument doc = new HtmlDocument();
        var list = search.CrawlerCnblosNewsList(url, httpClient, doc);

        // Start every article fetch first, then wait once, so the requests overlap.
        List<Task> taskList = new List<Task>();
        foreach (var item in list)
        {
            taskList.Add(search.CrawlerCnblosNews4(item, articles, httpClient, doc));
        }

        await Task.WhenAll(taskList);

        // Stop timing before the client is disposed so the measurement covers only the crawl.
        watch.Stop();
    }

    Console.WriteLine("異步方法M4-" + articles.Count + "-" + watch.Elapsed.TotalMilliseconds);
}
/// <summary>
/// Downloads a single news page, extracts its title, author and creation date, and
/// appends the result to <paramref name="articles"/>.
/// </summary>
/// <param name="ID">Text appended to "https://news.cnblogs.com" to build the page URL.</param>
/// <param name="articles">
/// Result list shared between calls. Additions are serialized by locking on the list
/// instance, because several calls may complete at the same time.
/// </param>
/// <param name="httpClient">Client used to issue the GET request.</param>
/// <param name="doc">
/// Not used: a fresh document is created for every page. The parameter is retained so
/// existing callers keep compiling.
/// </param>
/// <returns>A task that completes once the article has been added to the list.</returns>
/// <exception cref="HttpRequestException">
/// The request failed or the response status code does not indicate success.
/// </exception>
public async Task CrawlerCnblosNews4(string ID, List<Article> articles, HttpClient httpClient, HtmlDocument doc)
{
    Console.WriteLine(ID + " - CrawlerCnblosNews4 - " + Thread.CurrentThread.ManagedThreadId.ToString("00"));

    string url = "https://news.cnblogs.com" + ID;

    // Dispose the response as soon as the body has been read into a string.
    string resultArticle;
    using (HttpResponseMessage responseArticle = await httpClient.GetAsync(new Uri(url)))
    {
        responseArticle.EnsureSuccessStatusCode();
        resultArticle = await responseArticle.Content.ReadAsStringAsync();
    }

    // Each page gets its own document, so concurrent calls never share parser state.
    HtmlDocument page = new HtmlDocument();
    page.LoadHtml(resultArticle);

    // XPath targets on the article page:
    //   //*[@id="news_title"]            -> title
    //   //*[@id="news_info"]/span[1]/a   -> author
    //   //*[@id="news_info"]/span[2]     -> creation date
    //   //*[@id="news_body"]             -> body (not extracted here; Article.Content is left unset)
    // NOTE(review): assuming HtmlAgilityPack, SelectSingleNode returns null when nothing
    // matches, so a page missing any of these nodes throws NullReferenceException below.
    string title = page.DocumentNode.SelectSingleNode(@"//*[@id='news_title']").InnerText;
    string author = page.DocumentNode.SelectSingleNode(@"//*[@id='news_info']/span[1]/a").InnerHtml;
    string createDate = page.DocumentNode.SelectSingleNode(@"//*[@id='news_info']/span[2]").InnerText;

    Article article = new Article
    {
        ID = ID,
        Title = title,
        Author = author,
        CreateDate = createDate
    };

    // List<T> is not safe for concurrent writers, and the continuations of parallel
    // fetches can reach this point on different threads at the same time.
    lock (articles)
    {
        articles.Add(article);
    }
}
/// <summary>
/// Plain data holder for one crawled news article. All values are stored as raw strings.
/// </summary>
public class Article
{
    /// <summary>Identifier of the article; the crawler appends it to the site root to build the page URL.</summary>
    public string ID { get; set; }

    /// <summary>Text of the page's <c>news_title</c> element.</summary>
    public string Title { get; set; }

    /// <summary>Article body. The crawler code visible alongside this class never assigns it.</summary>
    public string Content { get; set; }

    /// <summary>Markup of the first link inside the page's <c>news_info</c> element.</summary>
    public string Author { get; set; }

    /// <summary>Text of the second span inside the page's <c>news_info</c> element, kept unparsed.</summary>
    public string CreateDate { get; set; }
}