[C#爬蟲_HtmlAgilityPack使用]_如何透過C#爬蟲批量將當前網頁圖片全下載下來



若要抓取並下載一個網頁上所有 png、jpg、gif 的圖檔
此時就要透過網頁爬蟲(web crawler)


在此我們透過C#搭配vs2019 和HtmlAgilityPack這個套件進行開發
HtmlAgilityPack套件
https://html-agility-pack.net/

授權採用 MIT license

nuget上也可直接配置安裝


首先aspx網頁介面部分

WebForm1.aspx 程式碼

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
<%@ Page Language="C#" AutoEventWireup="true" CodeBehind="WebForm1.aspx.cs" Inherits="ScrapApp.WebForm1" %>

<!DOCTYPE html>

<html xmlns="http://www.w3.org/1999/xhtml">
<head runat="server">
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
    <title></title>
    <style type="text/css">
        #TextArea1 {
            height: 458px;
            width: 976px;
        }
    </style>
</head>
<body>
    <form id="form1" runat="server">
        <div>
            <%-- Target page URL to crawl. --%>
            <asp:Label ID="Label1" runat="server" Text="URL:"></asp:Label>            
            <asp:TextBox ID="txtURL" runat="server"></asp:TextBox>
            <br/>
            <%-- Action selector: Value "1" dumps raw HTML, "2" lists image links (see btnScrap_Click). --%>
            <asp:DropDownList ID="ddl_action" runat="server">
                <asp:ListItem Value="1">爬取html文本</asp:ListItem>
                <asp:ListItem Value="2">爬取圖片連結</asp:ListItem>
            </asp:DropDownList>
            <asp:Button ID="btnScrap" runat="server" Text="web scrap" OnClick="btnScrap_Click" />
            <br/>
            <%-- Downloads every image found on the page to a local folder (path fixed in code-behind). --%>
            <asp:Button ID="btnBatchDownload" runat="server" Text="Download Images" OnClick="btnBatchDownload_Click"/>
            <br />
            <%-- Output area for scrape results. --%>
            <textarea id="TextArea1" runat="server"></textarea>
        </div>
    </form>
</body>
</html>





WebForm1.aspx.cs 程式碼


 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Web;
using System.Web.UI;
using System.Web.UI.WebControls;
using HtmlAgilityPack;

namespace ScrapApp
{
    /// <summary>
    /// Code-behind for the scraping demo page: fetches a page's raw HTML or its
    /// image links, and can batch-download every image to a local folder.
    /// </summary>
    public partial class WebForm1 : System.Web.UI.Page
    {
        //https://stackoverflow.com/questions/307688/how-to-download-a-file-from-a-url-in-c
        //https://stackoverflow.com/questions/2113924/how-can-i-use-html-agility-pack-to-retrieve-all-the-images-from-a-website

        // No per-request initialization is needed for this page.
        protected void Page_Load(object sender, EventArgs e)
        {
        }

        // Runs the scrape chosen in the drop-down and shows the result in the text area.
        protected void btnScrap_Click(object sender, EventArgs e)
        {
            switch (ddl_action.SelectedValue)
            {
                case "1":
                    // Raw HTML text of the target page.
                    TextArea1.InnerText = WebCrawler.GetHtmlDocText(txtURL.Text);
                    break;
                case "2":
                    // One absolute image URL per line.
                    TextArea1.InnerText = WebCrawler.GetAllImageLinks(txtURL.Text);
                    break;
            }
        }

        // Downloads every image referenced by the page into a fixed local folder.
        protected void btnBatchDownload_Click(object sender, EventArgs e)
        {
            WebCrawler.BatchDownloadImages(txtURL.Text, @"D:\ImgData");
        }
    }
}


兩個Class程式

WebUtility.cs 
定義從 URL 取得檔案副檔名的共用方法

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;

namespace ScrapApp
{
    /// <summary>
    /// Small shared helpers for the crawler classes.
    /// </summary>
    public class WebUtility
    {
        /// <summary>
        /// Extracts the file extension (including the leading dot) from a URL,
        /// ignoring any query string or fragment.
        /// </summary>
        /// <param name="strURL">URL to inspect; may be absolute or relative.</param>
        /// <returns>
        /// The extension of the last path segment, e.g. ".png"; an empty string
        /// when the segment has no dot or the input is null/empty.
        /// </returns>
        public static string GetFileExtensionFromUrl(string strURL)
        {
            // Guard: null/empty input previously threw NullReferenceException.
            if (string.IsNullOrEmpty(strURL))
            {
                return "";
            }

            // Drop the query string AND fragment — neither belongs to the file name
            // (the original only stripped '?', so "a.png#top" yielded ".png#top").
            strURL = strURL.Split('?')[0].Split('#')[0];

            // Keep only the last path segment.
            strURL = strURL.Split('/').Last();

            return strURL.Contains('.') ? strURL.Substring(strURL.LastIndexOf('.')) : "";
        }
    }
}




WebCrawler.cs

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
using HtmlAgilityPack;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Web;

namespace ScrapApp
{
    /// <summary>
    /// Simple HtmlAgilityPack-based crawler: fetches a page, lists the image
    /// URLs it references, and can download all of them to a local folder.
    /// </summary>
    public class WebCrawler : WebUtility
    {
        //https://stackoverflow.com/questions/26189953/how-to-get-current-domain-name-in-asp-net/26190007
        //https://docs.microsoft.com/zh-tw/dotnet/api/system.uripartial?view=netcore-3.1#System_UriPartial_Authority
        //https://docs.microsoft.com/zh-tw/dotnet/api/system.uri.getleftpart?view=netcore-3.1

        /// <summary>
        /// Downloads the page at <paramref name="strURL"/> and returns its raw HTML text.
        /// </summary>
        public static string GetHtmlDocText(string strURL)
        {
            return GetHtmlDocObj(strURL).Text;
        }

        /// <summary>
        /// Downloads the page and parses it into an HtmlAgilityPack document,
        /// decoded as UTF-8.
        /// </summary>
        private static HtmlDocument GetHtmlDocObj(string strURL)
        {
            using (WebClient webClient = new WebClient())
            using (MemoryStream memoryStream = new MemoryStream(webClient.DownloadData(strURL)))
            {
                HtmlDocument doc = new HtmlDocument();
                doc.Load(memoryStream, Encoding.UTF8);
                return doc;
            }
        }

        /// <summary>
        /// Collects every non-empty &lt;img src&gt; on the page, resolved to an
        /// absolute URL against the page's own URI.
        /// </summary>
        private static List<string> GetImageUrls(string strURL)
        {
            HtmlDocument doc = GetHtmlDocObj(strURL);
            Uri baseUri = new Uri(strURL);

            return doc.DocumentNode.Descendants("img")
                .Select(ele => ele.GetAttributeValue("src", null))
                .Where(s => !String.IsNullOrEmpty(s))
                // new Uri(base, relative) correctly resolves absolute, root-relative
                // ("/a/b.png"), document-relative ("a/b.png") and protocol-relative
                // ("//host/b.png") links. The old authority-prefix concatenation
                // produced broken URLs for document-relative paths ("https://hosta/b.png").
                .Select(s => new Uri(baseUri, s).AbsoluteUri)
                .ToList();
        }

        /// <summary>
        /// Downloads every image referenced by the page into <paramref name="saveDir"/>.
        /// Files are named {fileName}_{index}{ext}; the index starts at
        /// <paramref name="beginIdx"/> and advances by <paramref name="interval"/>.
        /// </summary>
        /// <param name="strURL">Page to crawl.</param>
        /// <param name="saveDir">Target folder; created if it does not exist.</param>
        /// <param name="fileName">File-name prefix for the saved images.</param>
        /// <param name="beginIdx">First index used in the generated file names.</param>
        /// <param name="interval">Step between consecutive indices.</param>
        public static void BatchDownloadImages(string strURL, string saveDir, string fileName = "img", int beginIdx = 1, int interval = 1)
        {
            // Make sure the target folder exists before writing into it.
            Directory.CreateDirectory(saveDir);

            int idx = beginIdx;
            // One WebClient reused for all downloads (the original leaked one per image).
            using (WebClient webClientImg = new WebClient())
            {
                foreach (string imgURL in GetImageUrls(strURL))
                {
                    string fileExt = GetFileExtensionFromUrl(imgURL);
                    string saveFilePath = Path.Combine(saveDir, fileName + String.Format("_{0}{1}", idx, fileExt));
                    webClientImg.DownloadFile(imgURL, saveFilePath);
                    idx += interval;
                }
            }
        }

        /// <summary>
        /// Returns all image URLs on the page, one per line, with relative links
        /// resolved to absolute ones.
        /// </summary>
        public static string GetAllImageLinks(string strURL)
        {
            StringBuilder sbResult = new StringBuilder();
            foreach (string imgURL in GetImageUrls(strURL))
            {
                sbResult.AppendLine(imgURL);
            }
            return sbResult.ToString();
        }
    }
}


在此用這個網頁做測試
https://www.taifex.com.tw/cht/5/stockMargining









最終結果


















留言

這個網誌中的熱門文章

經得起原始碼資安弱點掃描的程式設計習慣培養(五)_Missing HSTS Header

經得起原始碼資安弱點掃描的程式設計習慣培養(三)_7.Cross Site Scripting(XSS)_Stored XSS_Reflected XSS All Clients

(2021年度)駕訓學科筆試準備題庫歸納分析_法規是非題