HTTP请求工具类(功能:1、获取网页HTML;2、下载网络图片):
using System;
using System.Collections.Generic;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;

namespace Utils
{
    /// <summary>
    /// HTTP request helpers: fetch a page's HTML and download images that
    /// satisfy a minimum width and height.
    /// </summary>
    public class HttpRequestUtil
    {
        private const string UserAgent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)";

        /// <summary>
        /// Downloads the page at <paramref name="url"/> and returns its body decoded as UTF-8.
        /// </summary>
        /// <param name="url">Absolute HTTP or HTTPS address of the page.</param>
        /// <returns>The response body as text.</returns>
        /// <exception cref="ArgumentException"><paramref name="url"/> is not an HTTP(S) address.</exception>
        /// <exception cref="WebException">The request failed.</exception>
        public static string GetPageHtml(string url)
        {
            HttpWebRequest request = CreateRequest(url);

            // Nothing is sent until GetResponse() is called. The using blocks
            // release the connection even when reading throws.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream responseStream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(responseStream, Encoding.UTF8))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>
        /// Downloads the image at <paramref name="url"/> into the "download" folder under
        /// the application's start-up path, but only when the decoded image is at least
        /// <paramref name="minWidth"/> x <paramref name="minHeight"/> pixels.
        /// Does nothing when a file with the same name already exists.
        /// </summary>
        /// <param name="url">Absolute HTTP or HTTPS address of the image.</param>
        /// <param name="minWidth">Minimum accepted image width in pixels.</param>
        /// <param name="minHeight">Minimum accepted image height in pixels.</param>
        /// <exception cref="ArgumentException">
        /// <paramref name="url"/> is empty, is not an HTTP(S) address, or contains no file name.
        /// </exception>
        public static void HttpDownloadFile(string url, int minWidth, int minHeight)
        {
            if (string.IsNullOrEmpty(url))
            {
                throw new ArgumentException("A URL is required.", "url");
            }

            string fileName = GetFileName(url);
            string path = Path.Combine(Application.StartupPath, "download");
            Directory.CreateDirectory(path); // no-op when the folder already exists

            string filePathName = Path.Combine(path, fileName);
            if (File.Exists(filePathName))
            {
                return;
            }

            HttpWebRequest request = CreateRequest(url);
            request.Proxy = null;

            using (MemoryStream imageData = new MemoryStream())
            {
                // Buffer the whole body first so the connection is released
                // before the image is decoded.
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (Stream responseStream = response.GetResponseStream())
                {
                    responseStream.CopyTo(imageData);
                }

                imageData.Position = 0;
                using (Image image = Image.FromStream(imageData, true))
                {
                    if (image.Height < minHeight || image.Width < minWidth)
                    {
                        return;
                    }
                }

                File.WriteAllBytes(filePathName, imageData.ToArray());
            }
        }

        /// <summary>Creates an HTTP request for <paramref name="url"/> with the shared user agent.</summary>
        private static HttpWebRequest CreateRequest(string url)
        {
            HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
            if (request == null)
            {
                throw new ArgumentException("Only HTTP and HTTPS URLs are supported.", "url");
            }

            request.UserAgent = UserAgent;
            return request;
        }

        /// <summary>
        /// Returns the text after the last '/' of the URL's path, with any query string or
        /// fragment removed and characters that are invalid in file names replaced by '_'.
        /// </summary>
        private static string GetFileName(string url)
        {
            int cut = url.IndexOfAny(new char[] { '?', '#' });
            string pathPart = cut >= 0 ? url.Substring(0, cut) : url;
            string name = pathPart.Substring(pathPart.LastIndexOf('/') + 1);

            foreach (char invalid in Path.GetInvalidFileNameChars())
            {
                name = name.Replace(invalid, '_');
            }

            if (name.Length == 0)
            {
                throw new ArgumentException("The URL does not end in a file name.", "url");
            }

            return name;
        }
    }
}
VisitedHelper类:
using System.Collections.Generic;

// NOTE(review): the original namespace/class header was lost in extraction.
// "Utils" is assumed because the crawler form imports Utils and calls
// VisitedHelper unqualified - confirm against the real project.
namespace Utils
{
    /// <summary>
    /// Process-wide record of the URLs that have already been visited.
    /// </summary>
    /// <remarks>
    /// The crawler calls these members from thread-pool threads, so every
    /// access to the shared set is taken under a lock.
    /// </remarks>
    public class VisitedHelper
    {
        private static readonly object s_gate = new object();

        // A set gives O(1) membership tests; comparison is ordinal and case-sensitive.
        private static readonly HashSet<string> s_visited = new HashSet<string>();

        #region IsVisited
        /// <summary>
        /// Reports whether <paramref name="url"/> has been recorded by <see cref="Add"/>.
        /// </summary>
        /// <param name="url">The address to look up.</param>
        /// <returns><c>true</c> when the exact string was added earlier; otherwise <c>false</c>.</returns>
        public static bool IsVisited(string url)
        {
            lock (s_gate)
            {
                return s_visited.Contains(url);
            }
        }
        #endregion

        #region Add
        /// <summary>
        /// Records <paramref name="url"/> as visited. Adding the same address again has no effect.
        /// </summary>
        /// <param name="url">The address to record.</param>
        public static void Add(string url)
        {
            lock (s_gate)
            {
                s_visited.Add(url);
            }
        }
        #endregion
    }
}
多线程爬取网页代码:
using System;
using System.ComponentModel;
using System.Data;
using System.Text.RegularExpressions;
using System.Threading;
using System.Windows.Forms;
using Utils;

namespace 爬虫
{
    /// <summary>
    /// Crawler window: starting from the URL typed by the user it downloads every
    /// image that meets the minimum size and follows links on the same host,
    /// one thread-pool work item per page.
    /// </summary>
    public partial class Form1 : Form
    {
        private const int MaxPoolThreads = 100;

        // Group 1 is the href value of an <a> tag whose content is plain text.
        private static readonly Regex s_anchorRegex = new Regex(
            @"<a[\s]+[^<>]*href=(?:""|')([^<>""']+)(?:""|')[^<>]*>[^<>]+</a>", RegexOptions.IgnoreCase);

        // Group 1 is the src value of an <img> tag ending in jpg, jpeg, png or gif.
        private static readonly Regex s_imageRegex = new Regex(
            @"<img[\s]+[^<>]*src=(?:""|')([^<>""']+(?:jpg|jpeg|png|gif))(?:""|')[^<>]*>", RegexOptions.IgnoreCase);

        private static readonly Regex s_hostRegex = new Regex(
            @"(?:http|https)://[a-z0-9\-\.:]+", RegexOptions.IgnoreCase);

        private static int m_MinWidth = 300;
        // NOTE(review): the initial values of the next two fields were lost in
        // extraction; 300 and 0 are assumed - confirm.
        private static int m_MinHeight = 300;
        private static int m_CompletedCount = 0;

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Reads the size limits from the text boxes and starts crawling on a new thread.
        /// </summary>
        private void button1_Click(object sender, EventArgs e)
        {
            ThreadPool.SetMaxThreads(MaxPoolThreads, MaxPoolThreads);
            int.TryParse(txtMinWidth.Text, out m_MinWidth);
            int.TryParse(txtMinHeight.Text, out m_MinHeight);
            button1.Enabled = false;
            lblMsg.Text = "正在爬取图片…";
            timer1.Start();

            // Read the control on the UI thread; the worker must not touch controls.
            string startUrl = txtUrl.Text;
            new Thread(new ThreadStart(delegate()
            {
                Crawling(startUrl, null);
            })).Start();
        }

        /// <summary>
        /// Crawls one page: downloads its qualifying images, then queues its links.
        /// Pages already recorded in <c>VisitedHelper</c> are skipped.
        /// </summary>
        /// <param name="url">Absolute address of the page.</param>
        /// <param name="host">Host prefix the crawl is confined to; <c>null</c> derives it from <paramref name="url"/>.</param>
        private void Crawling(string url, string host)
        {
            if (VisitedHelper.IsVisited(url))
            {
                return;
            }

            VisitedHelper.Add(url);

            if (host == null)
            {
                host = GetHost(url);
            }

            string pageHtml = HttpRequestUtil.GetPageHtml(url);
            DownloadImages(pageHtml, host);
            QueueLinks(pageHtml, host);
        }

        /// <summary>Downloads every image on the page whose declared size passes the limits.</summary>
        private void DownloadImages(string pageHtml, string host)
        {
            foreach (Match imageMatch in s_imageRegex.Matches(pageHtml))
            {
                string imageUrl = imageMatch.Groups[1].Value;
                try
                {
                    // Both dimensions come from the whole <img> tag. The original
                    // passed the URL for the height, so that pre-filter never rejected.
                    int imageWidth = GetimageWidthOrHeight(imageMatch.Value, true);
                    int imageHeight = GetimageWidthOrHeight(imageMatch.Value, false);
                    if (imageWidth < m_MinWidth || imageHeight < m_MinHeight)
                    {
                        continue;
                    }

                    if (imageUrl.IndexOf("javascript") != -1)
                    {
                        continue;
                    }

                    HttpRequestUtil.HttpDownloadFile(ToAbsoluteUrl(imageUrl, host), m_MinWidth, m_MinHeight);
                }
                catch (Exception ex)
                {
                    // Best effort: one bad image must not stop the page.
                    System.Diagnostics.Debug.WriteLine(ex);
                }
            }
        }

        /// <summary>Queues a thread-pool work item for every link that stays on <paramref name="host"/>.</summary>
        private void QueueLinks(string pageHtml, string host)
        {
            foreach (Match linkMatch in s_anchorRegex.Matches(pageHtml))
            {
                string nextUrl = linkMatch.Groups[1].Value;
                if (nextUrl.IndexOf("javascript") != -1)
                {
                    continue;
                }

                // Declared inside the loop so each work item captures its own value.
                string target = ToAbsoluteUrl(nextUrl, host);

                // Test the link itself. The original tested the current page, which
                // let off-site pages through and resolved their relative URLs
                // against the wrong host.
                if (!string.Equals(GetHost(target), host, StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                ThreadPool.QueueUserWorkItem(new WaitCallback(delegate(object state)
                {
                    try
                    {
                        Crawling(target, host);
                        // Pool threads update the counter concurrently.
                        Interlocked.Increment(ref m_CompletedCount);
                    }
                    catch (Exception ex)
                    {
                        // Best effort: a failed page is skipped.
                        System.Diagnostics.Debug.WriteLine(ex);
                    }
                }));
            }
        }

        /// <summary>Returns <paramref name="link"/> unchanged when it starts with "http", otherwise prefixes it with <paramref name="host"/>.</summary>
        private static string ToAbsoluteUrl(string link, string host)
        {
            return link.StartsWith("http", StringComparison.Ordinal) ? link : host + link;
        }

        /// <summary>
        /// Extracts the scheme-and-authority prefix of <paramref name="url"/>.
        /// </summary>
        private string GetHost(string url)
        {
            Match mHost = s_hostRegex.Match(url);
            // NOTE(review): the appended literal was lost in extraction; "/" is assumed - confirm.
            return mHost.Value + "/";
        }

        /// <summary>
        /// Timer callback: shows the finished message once the pool is idle and at
        /// least one page has completed.
        /// </summary>
        private void timer1_Tick(object sender, EventArgs e)
        {
            int workerThreads;
            int completionPortThreads;
            int maxWorkerThreads;
            int maxCompletionPortThreads;
            ThreadPool.GetAvailableThreads(out workerThreads, out completionPortThreads);
            // Compare with the real maximum: SetMaxThreads can be rejected, in which
            // case the limit is not 100.
            ThreadPool.GetMaxThreads(out maxWorkerThreads, out maxCompletionPortThreads);

            int completed = Interlocked.CompareExchange(ref m_CompletedCount, 0, 0);
            if (workerThreads == maxWorkerThreads && completed > 0)
            {
                lblMsg.Text = "已结束";
            }
            else
            {
                // NOTE(review): this literal was lost in extraction; the in-progress
                // text from button1_Click is assumed - confirm.
                lblMsg.Text = "正在爬取图片…";
            }
        }

        /// <summary>
        /// Reads the width or height declared in an &lt;img&gt; tag, either as an attribute
        /// (<c>width="300"</c>) or as an inline style (<c>width: 300px;</c>).
        /// </summary>
        /// <param name="imageTagString">Text of the whole &lt;img&gt; tag.</param>
        /// <param name="isWidth"><c>true</c> for the width, <c>false</c> for the height.</param>
        /// <returns>The declared size truncated to an integer, or <c>int.MaxValue</c> when none is declared or it cannot be parsed.</returns>
        private int GetimageWidthOrHeight(string imageTagString, bool isWidth)
        {
            string tag = isWidth ? "width" : "height";

            Regex reg = new Regex(string.Format(@"{0}=""([\d\.]+)""", tag), RegexOptions.IgnoreCase);
            Match match = reg.Match(imageTagString);
            if (match.Success)
            {
                return ParsePixels(match.Groups[1].Value);
            }

            reg = new Regex(string.Format(@"{0}[\s]*:[\s]*([\d\.]+)[\s]*px[\s]*;", tag), RegexOptions.IgnoreCase);
            match = reg.Match(imageTagString);
            if (match.Success)
            {
                return ParsePixels(match.Groups[1].Value);
            }

            return int.MaxValue;
        }

        /// <summary>Parses a decimal pixel count culture-independently; unparsable or oversized input yields <c>int.MaxValue</c>.</summary>
        private static int ParsePixels(string text)
        {
            double value;
            bool parsed = double.TryParse(
                text,
                System.Globalization.NumberStyles.Float,
                System.Globalization.CultureInfo.InvariantCulture,
                out value);
            if (!parsed || value >= int.MaxValue)
            {
                return int.MaxValue;
            }

            return (int)value;
        }
    } // end of Form1

    /// <summary>
    /// Delegate for cross-thread access to controls.
    /// </summary>
    public delegate void InvokeDelegate();
}
截图:
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 [email protected] 举报,一经查实,本站将立刻删除。