[C#爬蟲_HtmlAgilityPack使用]_如何透過C#爬蟲批量將當前網頁圖片全下載下來



若要抓取並下載一個網頁上所有 png、jpg、gif 的圖檔
此時就要透過網頁爬蟲(web crawler)


在此我們透過C#搭配vs2019 和HtmlAgilityPack這個套件進行開發
HtmlAgilityPack套件
https://html-agility-pack.net/

授權採用 MIT license

nuget上也可直接配置安裝


首先aspx網頁介面部分

WebForm1.aspx 程式碼

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
<%@ Page Language="C#" AutoEventWireup="true" CodeBehind="WebForm1.aspx.cs" Inherits="ScrapApp.WebForm1" %>

<!DOCTYPE html>

<html xmlns="http://www.w3.org/1999/xhtml">
<head runat="server">
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
    <title></title>
    <style type="text/css">
        #TextArea1 {
            height: 458px;
            width: 976px;
        }
    </style>
</head>
<body>
    <form id="form1" runat="server">
        <div>
            <%-- Target page URL to crawl. --%>
            <asp:Label ID="Label1" runat="server" Text="URL:"></asp:Label>            
            <asp:TextBox ID="txtURL" runat="server"></asp:TextBox>
            <br/>
            <%-- Action selector: Value "1" dumps raw HTML, "2" lists image links (see btnScrap_Click). --%>
            <asp:DropDownList ID="ddl_action" runat="server">
                <asp:ListItem Value="1">爬取html文本</asp:ListItem>
                <asp:ListItem Value="2">爬取圖片連結</asp:ListItem>
            </asp:DropDownList>
            <asp:Button ID="btnScrap" runat="server" Text="web scrap" OnClick="btnScrap_Click" />
            <br/>
            <%-- Downloads every image found on the page to a local folder (path fixed in code-behind). --%>
            <asp:Button ID="btnBatchDownload" runat="server" Text="Download Images" OnClick="btnBatchDownload_Click"/>
            <br />
            <%-- Output area for scrape results. --%>
            <textarea id="TextArea1" runat="server"></textarea>
        </div>
    </form>
</body>
</html>





WebForm1.aspx.cs 程式碼


 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Web;
using System.Web.UI;
using System.Web.UI.WebControls;
using HtmlAgilityPack;

namespace ScrapApp
{
    /// <summary>
    /// Code-behind for the scraping demo page: fetches a page's raw HTML or its
    /// image links, and can batch-download every image to a local folder.
    /// </summary>
    public partial class WebForm1 : System.Web.UI.Page
    {
        //https://stackoverflow.com/questions/307688/how-to-download-a-file-from-a-url-in-c
        //https://stackoverflow.com/questions/2113924/how-can-i-use-html-agility-pack-to-retrieve-all-the-images-from-a-website

        // No per-request initialization is needed for this page.
        protected void Page_Load(object sender, EventArgs e)
        {
        }

        // Runs the scrape chosen in the drop-down and shows the result in the text area.
        protected void btnScrap_Click(object sender, EventArgs e)
        {
            switch (ddl_action.SelectedValue)
            {
                case "1":
                    // Raw HTML text of the target page.
                    TextArea1.InnerText = WebCrawler.GetHtmlDocText(txtURL.Text);
                    break;
                case "2":
                    // One absolute image URL per line.
                    TextArea1.InnerText = WebCrawler.GetAllImageLinks(txtURL.Text);
                    break;
            }
        }

        // Downloads every image referenced by the page into a fixed local folder.
        protected void btnBatchDownload_Click(object sender, EventArgs e)
        {
            WebCrawler.BatchDownloadImages(txtURL.Text, @"D:\ImgData");
        }
    }
}


兩個Class程式

WebUtility.cs 
定義從 URL 取得檔案副檔名的共用方法

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;

namespace ScrapApp
{
    /// <summary>
    /// Small shared helpers for the crawler classes.
    /// </summary>
    public class WebUtility
    {
        /// <summary>
        /// Extracts the file extension (including the leading dot) from a URL,
        /// ignoring any query string or fragment.
        /// </summary>
        /// <param name="strURL">URL to inspect; may be absolute or relative.</param>
        /// <returns>
        /// The extension of the last path segment, e.g. ".png"; an empty string
        /// when the segment has no dot or the input is null/empty.
        /// </returns>
        public static string GetFileExtensionFromUrl(string strURL)
        {
            // Guard: null/empty input previously threw NullReferenceException.
            if (string.IsNullOrEmpty(strURL))
            {
                return "";
            }

            // Drop the query string AND fragment — neither belongs to the file name
            // (the original only stripped '?', so "a.png#top" yielded ".png#top").
            strURL = strURL.Split('?')[0].Split('#')[0];

            // Keep only the last path segment.
            strURL = strURL.Split('/').Last();

            return strURL.Contains('.') ? strURL.Substring(strURL.LastIndexOf('.')) : "";
        }
    }
}




WebCrawler.cs

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
using HtmlAgilityPack;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Web;

namespace ScrapApp
{
    /// <summary>
    /// Simple HtmlAgilityPack-based crawler: fetches a page, lists the image
    /// URLs it references, and can download all of them to a local folder.
    /// </summary>
    public class WebCrawler : WebUtility
    {
        //https://stackoverflow.com/questions/26189953/how-to-get-current-domain-name-in-asp-net/26190007
        //https://docs.microsoft.com/zh-tw/dotnet/api/system.uripartial?view=netcore-3.1#System_UriPartial_Authority
        //https://docs.microsoft.com/zh-tw/dotnet/api/system.uri.getleftpart?view=netcore-3.1

        /// <summary>
        /// Downloads the page at <paramref name="strURL"/> and returns its raw HTML text.
        /// </summary>
        public static string GetHtmlDocText(string strURL)
        {
            return GetHtmlDocObj(strURL).Text;
        }

        /// <summary>
        /// Downloads the page and parses it into an HtmlAgilityPack document,
        /// decoded as UTF-8.
        /// </summary>
        private static HtmlDocument GetHtmlDocObj(string strURL)
        {
            using (WebClient webClient = new WebClient())
            using (MemoryStream memoryStream = new MemoryStream(webClient.DownloadData(strURL)))
            {
                HtmlDocument doc = new HtmlDocument();
                doc.Load(memoryStream, Encoding.UTF8);
                return doc;
            }
        }

        /// <summary>
        /// Collects every non-empty &lt;img src&gt; on the page, resolved to an
        /// absolute URL against the page's own URI.
        /// </summary>
        private static List<string> GetImageUrls(string strURL)
        {
            HtmlDocument doc = GetHtmlDocObj(strURL);
            Uri baseUri = new Uri(strURL);

            return doc.DocumentNode.Descendants("img")
                .Select(ele => ele.GetAttributeValue("src", null))
                .Where(s => !String.IsNullOrEmpty(s))
                // new Uri(base, relative) correctly resolves absolute, root-relative
                // ("/a/b.png"), document-relative ("a/b.png") and protocol-relative
                // ("//host/b.png") links. The old authority-prefix concatenation
                // produced broken URLs for document-relative paths ("https://hosta/b.png").
                .Select(s => new Uri(baseUri, s).AbsoluteUri)
                .ToList();
        }

        /// <summary>
        /// Downloads every image referenced by the page into <paramref name="saveDir"/>.
        /// Files are named {fileName}_{index}{ext}; the index starts at
        /// <paramref name="beginIdx"/> and advances by <paramref name="interval"/>.
        /// </summary>
        /// <param name="strURL">Page to crawl.</param>
        /// <param name="saveDir">Target folder; created if it does not exist.</param>
        /// <param name="fileName">File-name prefix for the saved images.</param>
        /// <param name="beginIdx">First index used in the generated file names.</param>
        /// <param name="interval">Step between consecutive indices.</param>
        public static void BatchDownloadImages(string strURL, string saveDir, string fileName = "img", int beginIdx = 1, int interval = 1)
        {
            // Make sure the target folder exists before writing into it.
            Directory.CreateDirectory(saveDir);

            int idx = beginIdx;
            // One WebClient reused for all downloads (the original leaked one per image).
            using (WebClient webClientImg = new WebClient())
            {
                foreach (string imgURL in GetImageUrls(strURL))
                {
                    string fileExt = GetFileExtensionFromUrl(imgURL);
                    string saveFilePath = Path.Combine(saveDir, fileName + String.Format("_{0}{1}", idx, fileExt));
                    webClientImg.DownloadFile(imgURL, saveFilePath);
                    idx += interval;
                }
            }
        }

        /// <summary>
        /// Returns all image URLs on the page, one per line, with relative links
        /// resolved to absolute ones.
        /// </summary>
        public static string GetAllImageLinks(string strURL)
        {
            StringBuilder sbResult = new StringBuilder();
            foreach (string imgURL in GetImageUrls(strURL))
            {
                sbResult.AppendLine(imgURL);
            }
            return sbResult.ToString();
        }
    }
}


在此用這個網頁做測試
https://www.taifex.com.tw/cht/5/stockMargining









最終結果


















留言

這個網誌中的熱門文章

經得起原始碼資安弱點掃描的程式設計習慣培養(五)_Missing HSTS Header

經得起原始碼資安弱點掃描的程式設計習慣培養(三)_7.Cross Site Scripting(XSS)_Stored XSS_Reflected XSS All Clients

(2021年度)駕訓學科筆試準備題庫歸納分析_法規是非題