微信公众号搜"智元新知"关注
微信扫一扫可直接关注哦!

C#实现网页爬虫

HTTP请求工具类(功能:1、获取网页HTML;2、下载网络图片):

using System;
using System.Collections.Generic;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;

namespace Utils
{
    /// <summary>
    /// HTTP helper with two functions: fetching a page's HTML and downloading
    /// remote images that satisfy a minimum size.
    /// </summary>
    public class HttpRequestUtil
    {
        // User-Agent header sent with every request issued by this class.
        private const string UserAgent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)";

        // Name of the download folder created under the application's start-up directory.
        private const string DownloadFolderName = "download";

        /// <summary>
        /// Requests <paramref name="url"/> and returns the response body decoded as UTF-8.
        /// </summary>
        /// <param name="url">Absolute http/https address of the page.</param>
        /// <returns>The full response body as text.</returns>
        /// <exception cref="NotSupportedException">The URL does not produce an HTTP request.</exception>
        /// <exception cref="WebException">The request failed.</exception>
        public static string GetPageHtml(string url)
        {
            HttpWebRequest request = CreateRequest(url);

            // Nothing is sent until GetResponse() is called. No method is set, so the
            // request goes out with HttpWebRequest's default method (GET).
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream responseStream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(responseStream, Encoding.UTF8))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>
        /// Downloads the image at <paramref name="url"/> into the "download" folder next to
        /// the application, but only when the decoded image is at least
        /// <paramref name="minWidth"/> x <paramref name="minHeight"/> pixels.
        /// A file that already exists under the same name is left untouched.
        /// </summary>
        /// <param name="url">Absolute http/https address of the image.</param>
        /// <param name="minWidth">Minimum accepted image width in pixels.</param>
        /// <param name="minHeight">Minimum accepted image height in pixels.</param>
        /// <exception cref="NotSupportedException">The URL does not produce an HTTP request.</exception>
        /// <exception cref="WebException">The request failed.</exception>
        /// <exception cref="ArgumentException">The downloaded data is not a valid image.</exception>
        public static void HttpDownloadFile(string url, int minWidth, int minHeight)
        {
            string fileName = GetFileName(url);
            if (fileName.Length == 0)
            {
                // A URL ending in '/' has no usable file name; nothing can be saved.
                return;
            }

            string directory = Path.Combine(Application.StartupPath, DownloadFolderName);
            // CreateDirectory does nothing when the directory already exists.
            Directory.CreateDirectory(directory);

            string filePathName = Path.Combine(directory, fileName);
            if (File.Exists(filePathName))
            {
                return;
            }

            HttpWebRequest request = CreateRequest(url);
            request.Proxy = null;

            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream responseStream = response.GetResponseStream())
            using (MemoryStream buffer = new MemoryStream())
            {
                // Buffer the whole body first: the size is only known after decoding,
                // and the same bytes are written to disk afterwards.
                responseStream.CopyTo(buffer);

                // Rewind so decoding starts at the first byte of the image.
                buffer.Position = 0;
                using (Image image = Image.FromStream(buffer, true))
                {
                    if (image.Height < minHeight || image.Width < minWidth)
                    {
                        return;
                    }
                }

                using (FileStream file = new FileStream(filePathName, FileMode.Create))
                {
                    // WriteTo copies the entire buffer regardless of its current position.
                    buffer.WriteTo(file);
                }
            }
        }

        // Builds an HTTP request for the URL with the shared User-Agent applied.
        private static HttpWebRequest CreateRequest(string url)
        {
            HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
            if (request == null)
            {
                throw new NotSupportedException("Only http/https URLs are supported: " + url);
            }

            request.UserAgent = UserAgent;
            return request;
        }

        // Returns the text after the last '/' of the URL, ignoring any query string or fragment.
        private static string GetFileName(string url)
        {
            int end = url.IndexOfAny(new[] { '?', '#' });
            string path = end >= 0 ? url.Substring(0, end) : url;
            return path.Substring(path.LastIndexOf('/') + 1);
        }
    }
}
View Code

VisitedHelper类:

 namespace Utils
{
    using System.Collections.Generic;

    /// <summary>
    /// Process-wide record of the URLs that have already been visited.
    /// </summary>
    public class VisitedHelper
    {
        // The crawler calls this class from thread-pool threads, so every access
        // to the set goes through this lock.
        private static readonly object m_SyncRoot = new object();

        // A hash set gives constant-time membership checks; URLs are compared ordinally.
        private static readonly HashSet<string> m_VisitedSet = new HashSet<string>();

        /// <summary>
        /// Tells whether <paramref name="url"/> has been recorded by <see cref="Add"/>.
        /// </summary>
        /// <param name="url">The URL to look up (exact, case-sensitive match).</param>
        /// <returns><c>true</c> if the URL was added before; otherwise <c>false</c>.</returns>
        public static bool IsVisited(string url)
        {
            lock (m_SyncRoot)
            {
                return m_VisitedSet.Contains(url);
            }
        }

        /// <summary>
        /// Records <paramref name="url"/> as visited. Adding the same URL again has no effect.
        /// </summary>
        /// <param name="url">The URL to record.</param>
        public static void Add(string url)
        {
            lock (m_SyncRoot)
            {
                m_VisitedSet.Add(url);
            }
        }
    }
}
View Code

多线程爬取网页代码

 using System;
using System.ComponentModel;
using System.Data;
using System.Globalization;
using System.Text.RegularExpressions;
using System.Threading;
using System.Windows.Forms;
using Utils;

namespace 爬虫
{
    /// <summary>
    /// Main window of the image crawler. Starting from the URL typed by the user it
    /// downloads the images found on each page and follows same-site links on
    /// thread-pool threads.
    /// </summary>
    public partial class Form1 : Form
    {
        // Upper bound passed to ThreadPool.SetMaxThreads; timer1_Tick compares the number
        // of available worker threads against the same value to detect an idle pool.
        private const int MaxPoolThreads = 100;

        // Anchor tags: group 1 is the href value.
        private static readonly Regex LinkRegex = new Regex(@"<a[\s]+[^<>]*href=(?:""|')([^<>""']+)(?:""|')[^<>]*>[^<>]+</a>", RegexOptions.IgnoreCase);

        // Image tags whose src ends in jpg/jpeg/png/gif: group 1 is the src value.
        private static readonly Regex ImageRegex = new Regex(@"<img[\s]+[^<>]*src=(?:""|')([^<>""']+(?:jpg|jpeg|png|gif))(?:""|')[^<>]*>", RegexOptions.IgnoreCase);

        // Scheme plus authority of an http/https URL.
        private static readonly Regex HostRegex = new Regex(@"(?:http|https)://[a-z0-9\-\.:]+", RegexOptions.IgnoreCase);

        // Makes the "already visited? if not, mark it" step atomic across crawler threads.
        private static readonly object m_VisitLock = new object();

        private int m_MinWidth = 300;
        private int m_MinHeight = 300;

        // Number of queued page crawls that finished without an exception.
        private int m_CompletedCount = 0;

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Reads the size limits and start URL from the form and starts crawling
        /// on a separate thread.
        /// </summary>
        private void button1_Click(object sender, EventArgs e)
        {
            ThreadPool.SetMaxThreads(MaxPoolThreads, MaxPoolThreads);

            // Keep the current limit when the text box does not contain a number;
            // TryParse would otherwise overwrite it with 0.
            int parsed;
            if (int.TryParse(txtMinWidth.Text, out parsed))
            {
                m_MinWidth = parsed;
            }
            if (int.TryParse(txtMinHeight.Text, out parsed))
            {
                m_MinHeight = parsed;
            }

            button1.Enabled = false;
            lblMsg.Text = "正在爬取图片";
            timer1.Start();

            // Controls may only be read on the UI thread, so capture the URL before
            // the worker thread starts.
            string startUrl = txtUrl.Text;
            new Thread(new ThreadStart(delegate()
            {
                Crawling(startUrl, null);
            })).Start();
        }

        /// <summary>
        /// Crawls one page: downloads its qualifying images, then queues every link that
        /// stays on the same site for crawling on the thread pool.
        /// </summary>
        /// <param name="url">Address of the page to crawl.</param>
        /// <param name="host">
        /// Site prefix (as returned by <see cref="GetHost"/>) that links must share,
        /// or <c>null</c> on the first call to derive it from <paramref name="url"/>.
        /// </param>
        private void Crawling(string url, string host)
        {
            lock (m_VisitLock)
            {
                if (VisitedHelper.IsVisited(url))
                {
                    return;
                }
                VisitedHelper.Add(url);
            }

            if (host == null)
            {
                // Derive the prefix from the normalized form of the start URL so that it
                // compares equal to the normalized links produced by ResolveUrl.
                host = GetHost(ResolveUrl(url, url) ?? url);
            }

            string pageHtml = HttpRequestUtil.GetPageHtml(url);

            foreach (Match mImg in ImageRegex.Matches(pageHtml))
            {
                try
                {
                    string imageUrl = ResolveUrl(url, mImg.Groups[1].Value);
                    if (imageUrl == null)
                    {
                        continue;
                    }

                    // Both dimensions are read from the tag text. An undeclared dimension
                    // yields int.MaxValue, which leaves the decision to the size check
                    // HttpDownloadFile performs on the decoded image.
                    int imageWidth = GetimageWidthOrHeight(mImg.Value, true);
                    int imageHeight = GetimageWidthOrHeight(mImg.Value, false);
                    if (imageWidth >= m_MinWidth && imageHeight >= m_MinHeight)
                    {
                        HttpRequestUtil.HttpDownloadFile(imageUrl, m_MinWidth, m_MinHeight);
                    }
                }
                catch (Exception)
                {
                    // Best effort: one image that fails to download or decode must not
                    // stop the rest of the page.
                }
            }

            // Follow links recursively.
            foreach (Match mA in LinkRegex.Matches(pageHtml))
            {
                try
                {
                    string nextUrl = ResolveUrl(url, mA.Groups[1].Value);

                    // Stay on the starting site: the link itself (not the current page)
                    // must share the site prefix.
                    if (nextUrl == null || !string.Equals(GetHost(nextUrl), host, StringComparison.OrdinalIgnoreCase))
                    {
                        continue;
                    }

                    ThreadPool.QueueUserWorkItem(new WaitCallback(delegate(object obj)
                    {
                        try
                        {
                            Crawling(nextUrl, host);
                            Interlocked.Increment(ref m_CompletedCount);
                        }
                        catch (Exception)
                        {
                            // Best effort: a page that cannot be fetched is skipped.
                        }
                    }));
                }
                catch (Exception)
                {
                    // Best effort: a link that cannot be queued is skipped.
                }
            }
        } // end Crawling

        /// <summary>
        /// Returns the scheme-and-authority prefix of <paramref name="url"/> followed by "/",
        /// for example "http://example.com/". Returns just "/" when the URL is not http/https.
        /// </summary>
        private string GetHost(string url)
        {
            Match mHost = HostRegex.Match(url);
            return mHost.Value + "/";
        }

        /// <summary>
        /// Timer callback: shows "finished" once every pool thread is available again and
        /// at least one queued page has completed; otherwise shows the progress text.
        /// </summary>
        private void timer1_Tick(object sender, EventArgs e)
        {
            int workerThreads;
            int completionPortThreads;
            ThreadPool.GetAvailableThreads(out workerThreads, out completionPortThreads);

            bool finished = workerThreads == MaxPoolThreads && Volatile.Read(ref m_CompletedCount) > 0;
            lblMsg.Text = finished ? "已结束" : "正在爬取图片";
        }

        /// <summary>
        /// Extracts the declared width or height from the text of an img tag.
        /// </summary>
        /// <param name="imageTagString">Full text of the img tag.</param>
        /// <param name="isWidth"><c>true</c> to read the width, <c>false</c> to read the height.</param>
        /// <returns>
        /// The value of the attribute (<c>width="120"</c>) or, failing that, of the inline
        /// style (<c>width: 120px;</c>), truncated to an integer; <see cref="int.MaxValue"/>
        /// when the tag declares neither.
        /// </returns>
        private int GetimageWidthOrHeight(string imageTagString, bool isWidth)
        {
            string tag = isWidth ? "width" : "height";
            string[] patterns =
            {
                @"{0}=""([\d\.]+)""",                      // attribute form
                @"{0}[\s]*:[\s]*([\d\.]+)[\s]*px[\s]*;"    // inline style form
            };

            foreach (string pattern in patterns)
            {
                Match match = Regex.Match(imageTagString, string.Format(pattern, tag), RegexOptions.IgnoreCase);
                double value;
                // HTML numbers use '.' as the decimal separator whatever the UI culture is.
                if (match.Success
                    && double.TryParse(match.Groups[1].Value, NumberStyles.Float, CultureInfo.InvariantCulture, out value))
                {
                    return (int)value;
                }
            }

            return int.MaxValue;
        }

        // Resolves a link found on pageUrl to an absolute http/https URL without its
        // fragment; returns null for anything else (javascript:, mailto:, malformed links).
        private static string ResolveUrl(string pageUrl, string link)
        {
            Uri pageUri;
            Uri resolved;
            if (!Uri.TryCreate(pageUrl, UriKind.Absolute, out pageUri)
                || !Uri.TryCreate(pageUri, link, out resolved))
            {
                return null;
            }

            if (resolved.Scheme != Uri.UriSchemeHttp && resolved.Scheme != Uri.UriSchemeHttps)
            {
                return null;
            }

            // Dropping "#fragment" keeps anchors within one page from counting as separate pages.
            return resolved.GetLeftPart(UriPartial.Query);
        }

    } // end Form1

    /// <summary>
    /// Delegate for invoking code that touches controls from another thread.
    /// </summary>
    public delegate void InvokeDelegate();
}
View Code

截图:

 

版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 [email protected] 举报,一经查实,本站将立刻删除。

相关推荐