[C#爬蟲_HtmlAgilityPack使用]_如何透過C#爬蟲批量將當前網頁圖片全下載下來
若要抓取並下載某個網頁上所有 png、jpg、gif 格式的圖檔,
就需要透過網頁爬蟲(web crawler)來完成。
在此我們使用 C# 搭配 VS2019 與 HtmlAgilityPack 這個套件進行開發。
HtmlAgilityPack套件
https://html-agility-pack.net/
授權採用 MIT license
nuget上也可直接配置安裝
首先aspx網頁介面部分
WebForm1.aspx 程式碼
<%@ Page Language="C#" AutoEventWireup="true" CodeBehind="WebForm1.aspx.cs" Inherits="ScrapApp.WebForm1" %>

<!DOCTYPE html>

<html xmlns="http://www.w3.org/1999/xhtml">
<head runat="server">
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
    <title></title>
    <style type="text/css">
        #TextArea1 {
            height: 458px;
            width: 976px;
        }
    </style>
</head>
<body>
    <form id="form1" runat="server">
        <div>
            <%-- Page URL to crawl. AssociatedControlID makes the label render as
                 <label for="txtURL"> instead of a bare <span>, so it is tied to the input. --%>
            <asp:Label ID="Label1" runat="server" Text="URL:" AssociatedControlID="txtURL"></asp:Label>
            <asp:TextBox ID="txtURL" runat="server"></asp:TextBox>
            <br/>
            <%-- Action selector read by btnScrap_Click: "1" = page HTML text, "2" = image links. --%>
            <asp:DropDownList ID="ddl_action" runat="server">
                <asp:ListItem Value="1">爬取html文本</asp:ListItem>
                <asp:ListItem Value="2">爬取圖片連結</asp:ListItem>
            </asp:DropDownList>
            <asp:Button ID="btnScrap" runat="server" Text="web scrap" OnClick="btnScrap_Click" />
            <br/>
            <%-- Downloads every image referenced by the page at the entered URL. --%>
            <asp:Button ID="btnBatchDownload" runat="server" Text="Download Images" OnClick="btnBatchDownload_Click"/>
            <br />
            <%-- Output area written by the code-behind. --%>
            <textarea id="TextArea1" runat="server"></textarea>
        </div>
    </form>
</body>
</html>
WebForm1.aspx.cs 程式碼
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Web;
using System.Web.UI;
using System.Web.UI.WebControls;
using HtmlAgilityPack;

namespace ScrapApp
{
    /// <summary>
    /// Code-behind for the crawler page: reads the URL typed by the user and either
    /// shows the page HTML, lists its image links, or downloads all of its images.
    /// </summary>
    public partial class WebForm1 : System.Web.UI.Page
    {
        //https://stackoverflow.com/questions/307688/how-to-download-a-file-from-a-url-in-c
        //https://stackoverflow.com/questions/2113924/how-can-i-use-html-agility-pack-to-retrieve-all-the-images-from-a-website

        /// <summary>Directory on the server where downloaded images are stored.</summary>
        private const string ImageSaveDirectory = @"D:\ImgData";

        /// <summary>Message shown when the URL box does not contain a usable address.</summary>
        private const string InvalidUrlMessage = "Please enter an absolute http:// or https:// URL.";

        protected void Page_Load(object sender, EventArgs e)
        {
        }

        /// <summary>
        /// Runs the action selected in <c>ddl_action</c> against the entered URL and
        /// writes the result (or an error message) into <c>TextArea1</c>.
        /// </summary>
        protected void btnScrap_Click(object sender, EventArgs e)
        {
            string pageUrl;
            if (!TryGetHttpUrl(txtURL.Text, out pageUrl))
            {
                TextArea1.InnerText = InvalidUrlMessage;
                return;
            }

            try
            {
                if (ddl_action.SelectedValue == "1")
                {
                    TextArea1.InnerText = WebCrawler.GetHtmlDocText(pageUrl);
                }
                else if (ddl_action.SelectedValue == "2")
                {
                    TextArea1.InnerText = WebCrawler.GetAllImageLinks(pageUrl);
                }
            }
            catch (WebException ex)
            {
                // A failed fetch is an expected outcome for arbitrary URLs; report it
                // in the page instead of letting it surface as an unhandled error.
                TextArea1.InnerText = "Request failed: " + ex.Message;
            }
        }

        /// <summary>
        /// Downloads every image referenced by the entered URL into
        /// <see cref="ImageSaveDirectory"/> and reports the outcome in <c>TextArea1</c>.
        /// </summary>
        protected void btnBatchDownload_Click(object sender, EventArgs e)
        {
            string pageUrl;
            if (!TryGetHttpUrl(txtURL.Text, out pageUrl))
            {
                TextArea1.InnerText = InvalidUrlMessage;
                return;
            }

            try
            {
                WebCrawler.BatchDownloadImages(pageUrl, ImageSaveDirectory);
                TextArea1.InnerText = "Images saved to " + ImageSaveDirectory;
            }
            catch (WebException ex)
            {
                TextArea1.InnerText = "Download failed: " + ex.Message;
            }
        }

        /// <summary>
        /// Validates user input as an absolute http/https URL.
        /// </summary>
        /// <param name="text">Raw text from the URL box; may be null or padded with whitespace.</param>
        /// <param name="url">The trimmed URL when valid; otherwise null.</param>
        /// <returns>true when <paramref name="text"/> is an absolute http or https URL.</returns>
        private static bool TryGetHttpUrl(string text, out string url)
        {
            url = null;
            if (string.IsNullOrWhiteSpace(text))
            {
                return false;
            }

            string candidate = text.Trim();
            Uri parsed;
            if (!Uri.TryCreate(candidate, UriKind.Absolute, out parsed))
            {
                return false;
            }

            // WebClient also understands file:// and ftp:// URIs; restricting the scheme
            // keeps this page from being used to read files from the server's own disk.
            // NOTE(review): http(s) requests to internal hosts are still possible from
            // here - add a host allow-list if this page is ever exposed to untrusted users.
            if (parsed.Scheme != Uri.UriSchemeHttp && parsed.Scheme != Uri.UriSchemeHttps)
            {
                return false;
            }

            url = candidate;
            return true;
        }
    }
}
兩個Class程式
WebUtility.cs
定義從URL獲取副檔名
using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;

namespace ScrapApp
{
    /// <summary>
    /// URL helper routines shared by the crawler classes.
    /// </summary>
    public class WebUtility
    {
        /// <summary>
        /// Extracts the file extension (including the leading dot) from a URL.
        /// </summary>
        /// <param name="strURL">
        /// Absolute or relative URL. A query string (<c>?...</c>) and a fragment
        /// (<c>#...</c>) are ignored.
        /// </param>
        /// <returns>
        /// The text from the last '.' of the final path segment to its end, e.g.
        /// <c>".png"</c>; an empty string when the input is null/empty or the final
        /// segment contains no '.'.
        /// </returns>
        /// <remarks>
        /// The URL is treated as plain text: a URL that has no path at all
        /// (e.g. <c>http://example.com</c>) yields the host suffix (<c>".com"</c>).
        /// </remarks>
        public static string GetFileExtensionFromUrl(string strURL)
        {
            if (string.IsNullOrEmpty(strURL))
            {
                return "";
            }

            // Cut at whichever of '?' or '#' comes first so neither the query string
            // nor the fragment can leak into the extension.
            int cutAt = strURL.IndexOfAny(new[] { '?', '#' });
            string path = cutAt >= 0 ? strURL.Substring(0, cutAt) : strURL;

            // LastIndexOf returns -1 when there is no '/', which makes the start index 0.
            string lastSegment = path.Substring(path.LastIndexOf('/') + 1);

            int dotAt = lastSegment.LastIndexOf('.');
            return dotAt >= 0 ? lastSegment.Substring(dotAt) : "";
        }
    }
}
WebCrawler.cs
using HtmlAgilityPack;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Web;

namespace ScrapApp
{
    /// <summary>
    /// Fetches a web page with <see cref="WebClient"/>, parses it with HtmlAgilityPack
    /// and exposes helpers to read its HTML, list its image URLs, or download its images.
    /// </summary>
    public class WebCrawler : WebUtility
    {
        //https://stackoverflow.com/questions/26189953/how-to-get-current-domain-name-in-asp-net/26190007
        //https://docs.microsoft.com/zh-tw/dotnet/api/system.uripartial?view=netcore-3.1#System_UriPartial_Authority
        //https://docs.microsoft.com/zh-tw/dotnet/api/system.uri.getleftpart?view=netcore-3.1

        /// <summary>
        /// Downloads the page at <paramref name="strURL"/> and returns its HTML text.
        /// </summary>
        /// <param name="strURL">Address of the page to fetch.</param>
        /// <returns>The document text as loaded by HtmlAgilityPack.</returns>
        public static string GetHtmlDocText(string strURL)
        {
            return GetHtmlDocObj(strURL).Text;
        }

        /// <summary>
        /// Downloads the page at <paramref name="strURL"/> and parses it into an
        /// <see cref="HtmlDocument"/>. The response bytes are decoded as UTF-8.
        /// </summary>
        private static HtmlDocument GetHtmlDocObj(string strURL)
        {
            using (WebClient webClient = new WebClient())
            using (MemoryStream memoryStream = new MemoryStream(webClient.DownloadData(strURL)))
            {
                HtmlDocument doc = new HtmlDocument();
                doc.Load(memoryStream, Encoding.UTF8);
                return doc;
            }
        }

        /// <summary>
        /// Collects the <c>src</c> of every <c>img</c> element on the page and resolves
        /// each one against the page address.
        /// </summary>
        /// <param name="strURL">Absolute address of the page to scan.</param>
        /// <returns>Absolute image URIs in document order; unparsable values are dropped.</returns>
        private static List<Uri> GetImageUris(string strURL)
        {
            HtmlDocument doc = GetHtmlDocObj(strURL);
            Uri pageUri = new Uri(strURL);

            IEnumerable<string> sources = doc.DocumentNode.Descendants("img")
                .Select(ele => ele.GetAttributeValue("src", null))
                .Where(s => !String.IsNullOrEmpty(s));

            List<Uri> imageUris = new List<Uri>();
            foreach (string src in sources)
            {
                // Resolving against the page URI handles root-relative ("/a.png"),
                // document-relative ("img/a.png") and protocol-relative ("//cdn/a.png")
                // values, and leaves values that are already absolute untouched.
                Uri resolved;
                if (Uri.TryCreate(pageUri, src.Trim(), out resolved))
                {
                    imageUris.Add(resolved);
                }
            }

            return imageUris;
        }

        /// <summary>
        /// Downloads every image referenced by the page at <paramref name="strURL"/>
        /// into <paramref name="saveDir"/>.
        /// </summary>
        /// <param name="strURL">Absolute address of the page to scan.</param>
        /// <param name="saveDir">Target directory; it is created when it does not exist.</param>
        /// <param name="fileName">Prefix for saved files, which are named <c>{fileName}_{index}{extension}</c>.</param>
        /// <param name="beginIdx">Index used for the first saved file.</param>
        /// <param name="interval">Amount added to the index after each saved file.</param>
        /// <remarks>
        /// Only http/https image addresses are downloaded; other schemes (for example
        /// inline <c>data:</c> images) are skipped and do not consume an index.
        /// Any download error propagates to the caller and stops the batch.
        /// </remarks>
        public static void BatchDownloadImages(string strURL, string saveDir, string fileName = "img", int beginIdx = 1, int interval = 1)
        {
            List<Uri> imageUris = GetImageUris(strURL);

            // DownloadFile does not create missing directories; this is a no-op when
            // the directory already exists.
            Directory.CreateDirectory(saveDir);

            int idx = beginIdx;

            // One client is reused for the whole batch and disposed when it finishes.
            using (WebClient webClientImg = new WebClient())
            {
                foreach (Uri imageUri in imageUris)
                {
                    if (imageUri.Scheme != Uri.UriSchemeHttp && imageUri.Scheme != Uri.UriSchemeHttps)
                    {
                        continue;
                    }

                    string fileExt = GetFileExtensionFromUrl(imageUri.AbsoluteUri);
                    string saveFilePath = Path.Combine(saveDir, String.Format("{0}_{1}{2}", fileName, idx, fileExt));

                    webClientImg.DownloadFile(imageUri, saveFilePath);
                    idx += interval;
                }
            }
        }

        /// <summary>
        /// Lists the absolute URL of every image referenced by the page at
        /// <paramref name="strURL"/>.
        /// </summary>
        /// <param name="strURL">Absolute address of the page to scan.</param>
        /// <returns>One URL per line, in document order.</returns>
        public static string GetAllImageLinks(string strURL)
        {
            StringBuilder sbResult = new StringBuilder();
            foreach (Uri imageUri in GetImageUris(strURL))
            {
                sbResult.AppendLine(imageUri.AbsoluteUri);
            }

            return sbResult.ToString();
        }
    }
}
在此用這個網頁做測試
https://www.taifex.com.tw/cht/5/stockMargining
最終結果
留言
張貼留言