天天看點

C#異步采集

搞搞異步，採集一下博客園（cnblogs）新聞的第一頁。

/// <summary>
/// Fire-and-forget entry point: runs <see cref="DoRunAsync"/> and reports any
/// failure on standard error. Callers that can await should call
/// <see cref="DoRunAsync"/> directly instead.
/// </summary>
public static async void DoRun()
        {
            try
            {
                await DoRunAsync();
            }
            catch (Exception ex)
            {
                // An exception escaping an async void method cannot be observed
                // by the caller, so it is reported here instead of being left
                // to surface as an unhandled exception.
                Console.Error.WriteLine("DoRun failed: " + ex);
            }
        }

        /// <summary>
        /// Fetches the list of article IDs from the cnblogs news front page,
        /// starts one <c>CrawlerCnblosNews4</c> task per ID, waits for all of
        /// them, and prints the number of collected articles together with the
        /// elapsed time in milliseconds.
        /// </summary>
        /// <returns>A task that completes when every article task has finished.</returns>
        public static async Task DoRunAsync()
        {
            Stopwatch watch = Stopwatch.StartNew();

            string url = "https://news.cnblogs.com/";
            Search search = new Search();
            List<Article> articles = new List<Article>();
            HtmlDocument doc = new HtmlDocument();

            // One HttpClient is shared by every request of this run and is
            // disposed once all requests have completed (or one has failed).
            using (HttpClient httpClient = new HttpClient())
            {
                var list = search.CrawlerCnblosNewsList(url, httpClient, doc);

                // Start every download first, then await them together so the
                // requests overlap instead of running one after another.
                List<Task> taskList = new List<Task>();
                foreach (var item in list)
                {
                    taskList.Add(search.CrawlerCnblosNews4(item, articles, httpClient, doc));
                }

                await Task.WhenAll(taskList);
            }

            watch.Stop();
            Console.WriteLine("異步方法M4-" + articles.Count + "-" + watch.Elapsed.TotalMilliseconds);
        }
            /// <summary>
            /// Downloads one news page from <c>https://news.cnblogs.com</c>,
            /// extracts its title, author and creation date, and appends the
            /// result to <paramref name="articles"/>.
            /// </summary>
            /// <param name="ID">Value appended directly to the site root to form the page URL.</param>
            /// <param name="articles">
            /// Shared result list. Additions are serialized with a lock on the list
            /// instance because several invocations may finish at the same time.
            /// </param>
            /// <param name="httpClient">Client used to issue the GET request; not disposed here.</param>
            /// <param name="doc">
            /// Not used: each call parses into its own new <c>HtmlDocument</c>.
            /// Kept so existing callers continue to compile.
            /// </param>
            /// <returns>A task that completes once the article has been added to the list.</returns>
            /// <exception cref="HttpRequestException">The response status code does not indicate success.</exception>
            public async Task CrawlerCnblosNews4(string ID, List<Article> articles, HttpClient httpClient, HtmlDocument doc)
            {
                Console.WriteLine(ID + " - CrawlerCnblosNews4 - " + Thread.CurrentThread.ManagedThreadId.ToString("00"));
                string url = "https://news.cnblogs.com" + ID;

                string resultArticle;
                using (HttpResponseMessage responseArticle = await httpClient.GetAsync(new Uri(url)))
                {
                    responseArticle.EnsureSuccessStatusCode();
                    resultArticle = await responseArticle.Content.ReadAsStringAsync();
                }

                // A fresh document per call keeps parsing state private to this
                // invocation instead of sharing the caller's instance.
                HtmlDocument page = new HtmlDocument();
                page.LoadHtml(resultArticle);

                // XPath targets on the page:
                //   title  -> //*[@id='news_title']
                //   author -> //*[@id='news_info']/span[1]/a
                //   date   -> //*[@id='news_info']/span[2]
                //   body   -> //*[@id='news_body'] (not extracted here)
                // NOTE(review): SelectSingleNode presumably returns null when nothing
                // matches, in which case the member access below throws - confirm
                // whether such pages should be skipped instead.
                string title = page.DocumentNode.SelectSingleNode(@"//*[@id='news_title']").InnerText;
                string author = page.DocumentNode.SelectSingleNode(@"//*[@id='news_info']/span[1]/a").InnerHtml;
                string createDate = page.DocumentNode.SelectSingleNode(@"//*[@id='news_info']/span[2]").InnerText;

                Article article = new Article
                {
                    ID = ID,
                    Title = title,
                    Author = author,
                    CreateDate = createDate
                };

                // List<T> is not safe for concurrent writers, and the continuations
                // of parallel downloads can reach this point simultaneously.
                lock (articles)
                {
                    articles.Add(article);
                }
            }
    /// <summary>
    /// Plain data holder for one scraped news article. Every member is a
    /// read/write string and is <c>null</c> until assigned.
    /// </summary>
    public class Article
    {
        /// <summary>Identifier of the article.</summary>
        public string ID { get; set; }

        /// <summary>Title text of the article.</summary>
        public string Title { get; set; }

        /// <summary>Body content of the article.</summary>
        public string Content { get; set; }

        /// <summary>Author of the article.</summary>
        public string Author { get; set; }

        /// <summary>Creation date, kept as unparsed text.</summary>
        public string CreateDate { get; set; }
    }