11
2015
07

【C#、Asp.Net 工具类大全】Html常用帮助类

/// <summary>
/// Html常用帮助类
/// </summary>
public class HtmlHelper
{
    #region 获取页面源代码
    /// <summary>
    /// 获取网页源代码
    /// </summary>
    /// <param name="url">URL路径</param>
    /// <param name="encoding">编码方式</param>
    /// <returns></returns>
    public static string GetHTML(string url, string encoding)
    {
        WebClient web = new WebClient();
        byte[] buffer = web.DownloadData(url);
        return Encoding.GetEncoding(encoding).GetString(buffer);
    }
    /// <summary>
    /// WebClient读取源代码
    /// </summary>
    /// <param name="url">URL路径</param>
    /// <param name="encoding">编码方式</param>
    /// <returns></returns>
    public static string GetWebClient(string url, string encoding)
    {
        string strHTML = "";
        WebClient myWebClient = new WebClient();
        Stream myStream = myWebClient.OpenRead(url);
        StreamReader sr = new StreamReader(myStream, System.Text.Encoding.GetEncoding(encoding));
        strHTML = sr.ReadToEnd();
        myStream.Close();
        return strHTML;
    }
    /// <summary>
    /// WebRequest读取源代码
    /// </summary>
    /// <param name="url">URL路径</param>
    /// <param name="encoding">编码方式</param>
    /// <returns></returns>
    public static string GetWebRequest(string url, string encoding)
    {
        Uri uri = new Uri(url);
        WebRequest myReq = WebRequest.Create(uri);
        WebResponse result = myReq.GetResponse();
        Stream receviceStream = result.GetResponseStream();
        StreamReader readerOfStream = new StreamReader(receviceStream, System.Text.Encoding.GetEncoding(encoding));
        string strHTML = readerOfStream.ReadToEnd();
        readerOfStream.Close();
        receviceStream.Close();
        result.Close();
        return strHTML;
    }
    /// <summary>
    /// HttpWebRequest读取源代码
    /// </summary>
    /// <param name="url">URL路径</param>
    /// <param name="encoding">编码方式</param>
    /// <returns></returns>
    public static string GetHttpWebRequest(string url, string encoding)
    {
        Uri uri = new Uri(url);
        HttpWebRequest myReq = (HttpWebRequest)WebRequest.Create(uri);
        myReq.UserAgent = "User-Agent:Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705";
        myReq.Accept = "*/*";
        myReq.KeepAlive = true;
        myReq.Headers.Add("Accept-Language", "zh-cn,en-us;q=0.5");
        HttpWebResponse result = (HttpWebResponse)myReq.GetResponse();
        Stream receviceStream = result.GetResponseStream();
        StreamReader readerOfStream = new StreamReader(receviceStream, System.Text.Encoding.GetEncoding(encoding));
        string strHTML = readerOfStream.ReadToEnd();
        readerOfStream.Close();
        receviceStream.Close();
        result.Close();
        return strHTML;
    }
    /// <summary>       
    /// 获取HTML源码信息(Porschev)       
    /// </summary>       
    /// <param name="url">获取地址</param>       
    /// <returns>HTML源码</returns>       
    public static string GetHtmlCode(string url)
    {
        string str = "";
        try
        {
            Uri uri = new Uri(url);
            WebRequest wr = WebRequest.Create(uri);
            Stream s = wr.GetResponse().GetResponseStream();
            StreamReader sr = new StreamReader(s, Encoding.Default);
            do
            {
                string strLine = "";
                strLine = sr.ReadLine();//  读取一行字符并返回
                str += strLine + "\r\n";
            } while (!sr.EndOfStream);
        }
        catch (Exception e)
        {
        }
        return str;
    } 
    #endregion
    #region 清除格式化html标记
    ///<summary>   
    ///清除 获取到的 html 源码里面的所有标记   
    ///</summary>   
    ///<param name="Html">html 源码</param>   
    ///<returns>已经去除后的字符串</returns>   
    public static string RemoveHtml(string Html)
    {
        //删除脚本   
        Html = Regex.Replace(Html, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase);
        //删除HTML   
        Regex regex = new Regex("<.+?>", RegexOptions.IgnoreCase);
        Html = regex.Replace(Html, "");
        Html = Regex.Replace(Html, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
        Html = Regex.Replace(Html, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);
        Html = Regex.Replace(Html, @"-->", "", RegexOptions.IgnoreCase);
        Html = Regex.Replace(Html, @"<!--.*", "", RegexOptions.IgnoreCase);
        Html = Regex.Replace(Html, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase);
        Html = Regex.Replace(Html, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);
        Html = Regex.Replace(Html, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
        Html = Regex.Replace(Html, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
        Html = Regex.Replace(Html, @"&(nbsp|#160);", "   ", RegexOptions.IgnoreCase);
        Html = Regex.Replace(Html, @"&(iexcl|#161);", "\xa1", RegexOptions.IgnoreCase);
        Html = Regex.Replace(Html, @"&(cent|#162);", "\xa2", RegexOptions.IgnoreCase);
        Html = Regex.Replace(Html, @"&(pound|#163);", "\xa3", RegexOptions.IgnoreCase);
        Html = Regex.Replace(Html, @"&(copy|#169);", "\xa9", RegexOptions.IgnoreCase);
        Html = Regex.Replace(Html, @"&#(\d+);", "", RegexOptions.IgnoreCase);
        Html.Replace("<", "");
        Html.Replace(">", "");
        Html.Replace("\r\n", "");
        return Html;
    }
    /// <summary>
    /// 压缩获取到的 Html 字符串(删除换行字符串)
    /// </summary>
    /// <param name="Html">Html 源代码</param>
    /// <returns></returns>
    public static string ZipHtml(string Html)
    {
        Html = Regex.Replace(Html, @">\s+?<", "><");//去除HTML中的空白字符
        Html = Regex.Replace(Html, @"\r\n\s*", "");
        Html = Regex.Replace(Html, @"<body([\s|\S]*?)>([\s|\S]*?)</body>", @"<body$1>$2</body>", RegexOptions.IgnoreCase);
        return Html;
    }
    /// <summary>
    /// 格式化还原获取到的 Html 特殊符合代码(直接显示的html标记元素)
    /// </summary>
    /// <param name="Html">Html 源代码</param>
    /// <returns></returns>
    public static string FormatHtml(string Html)
    {
        Regex r;
        Match m;
        #region 处理空格
        Html = Html.Replace(" ", "&nbsp;");
        #endregion
        #region 处理单引号
        Html = Html.Replace("'", "’");
        #endregion
        #region 处理双引号
        Html = Html.Replace("\"", "&quot;");
        #endregion
        #region html标记符
        Html = Html.Replace("<", "&lt;");
        Html = Html.Replace(">", "&gt;");
        #endregion
        #region 处理换行
        //处理换行,在每个新行的前面添加两个全角空格
        r = new Regex(@"(\r\n((&nbsp;)| )+)(?<正文>\S+)", RegexOptions.IgnoreCase);
        for (m = r.Match(Html); m.Success; m = m.NextMatch())
        {
            Html = Html.Replace(m.Groups[0].ToString(), "<BR>  " + m.Groups["正文"].ToString());
        }
        //处理换行,在每个新行的前面添加两个全角空格
        Html = Html.Replace("\r\n", "<BR>");
        #endregion
        return Html;
    }
    /// <summary>
    /// 除去所有在html元素中标记
    /// </summary>
    /// <param name="strhtml">Html 源代码</param>
    /// <returns></returns>
    public static string StripHtml(string strhtml)
    {
        string stroutput = strhtml;
        Regex regex = new Regex(@"<[^>]+>|</[^>]+>");
        stroutput = regex.Replace(stroutput, "");
        return stroutput;
    }
    #endregion
    #region 文本中字符的转换
    /// <summary>
    /// 将文本格式转换为html代码
    /// </summary>
    /// <param name="str">要格式化的字符串</param>
    /// <returns>格式化后的字符串</returns>
    public static String ToHtml(string str)
    {
        if (str == null || str.Equals(""))
        {
            return str;
        }
        StringBuilder sb = new StringBuilder(str);
        sb.Replace("&", "&amp;");
        sb.Replace("<", "&lt;");
        sb.Replace(">", "&gt;");
        sb.Replace("\r\n", "<br>");
        sb.Replace("\n", "<br>");
        sb.Replace("\t", " ");
        sb.Replace(" ", "&nbsp;");
        return sb.ToString();
    }
    /// <summary>
    /// 将HTML代码转化成文本格式
    /// </summary>
    /// <param name="str">要格式化的字符串</param>
    /// <returns>格式化后的字符串</returns>
    public static String ToTxt(String str)
    {
        if (str == null || str.Equals(""))
        {
            return str;
        }
        StringBuilder sb = new StringBuilder(str);
        sb.Replace("&nbsp;", " ");
        sb.Replace("<br>", "\r\n");
        sb.Replace("&lt;", "<");
        sb.Replace("&gt;", ">");
        sb.Replace("&amp;", "&");
        return sb.ToString();
    }
    #endregion
    #region HTML特殊字符转换
    /// <summary>
    /// 替换html中的特殊字符
    /// </summary>
    /// <param name="theString">需要进行替换的文本。</param>
    /// <returns>替换完的文本。</returns>
    public static string HtmlEncode(string theString)
    {
        theString = theString.Replace(">", "&gt;");
        theString = theString.Replace("<", "&lt;");
        theString = theString.Replace("  ", " &nbsp;");
        theString = theString.Replace("\"", "&quot;");
        theString = theString.Replace("'", "&#39;");
        theString = theString.Replace("\r\n", "<br/> ");
        return theString;
    }
    /// <summary>
    /// 恢复html中的特殊字符
    /// </summary>
    /// <param name="theString">需要恢复的文本。</param>
    /// <returns>恢复好的文本。</returns>
    public static string HtmlDecode(string theString)
    {
        theString = theString.Replace("&gt;", ">");
        theString = theString.Replace("&lt;", "<");
        theString = theString.Replace(" &nbsp;", "  ");
        theString = theString.Replace("&quot;", "\"");
        theString = theString.Replace("&#39;", "'");
        theString = theString.Replace("<br/> ", "\r\n");
        theString = theString.Replace("&mdash;", "—");//2012-05-07新加的
        return theString;
    }
    #endregion
    #region html中读取a标签的href值
    /// <summary>
    /// 正则表达式获取html超链接及对应链接里面的内容
    /// </summary>
    /// <param name="content">html 源代码</param>
    /// <returns></returns>
    public static Dictionary<string, string> GetUrl(string content)
    {
        Dictionary<string, string> dics = new Dictionary<string, string>();
        string pattern = @"(?is)<a[^>]*?href=(['""]?)(?<url>[^'""\s>]+)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>";
        MatchCollection mc = Regex.Matches(content, pattern);
        foreach (Match m in mc)
        {
            if (m.Success)
            {
                //加入集合数组
                //hrefList.Add(m.Groups["href"].Value);
                //nameList.Add(m.Groups["name"].Value);
                try
                {
                    dics.Add(m.Groups["url"].Value, m.Groups["text"].Value);
                }
                catch (Exception ex)
                {
                    Console.WriteLine(ex.Message);
                }
            }
        }
        return dics;
    }
    #endregion
    #region html中获取图片
    /// <summary>
    /// 取得HTML中首张图片的 URL
    /// </summary>
    /// <param name="sHtmlText">HTML代码</param>
    /// <returns>图片的源地址列表</returns>
    public static string getHtmlFirstImage(string sHtmlText)
    {
        // 定义正则表达式用来匹配 img 标签
        Regex regImg = new Regex(@"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>", RegexOptions.IgnoreCase);
        // 搜索匹配的字符串
        MatchCollection matches = regImg.Matches(sHtmlText);
        string ImgStr = "";
        // 取得匹配项列表
        if (matches != null && matches.Count > 0)
        {
            for (int i = 0; i < matches.Count; i++)
            {
                string sUrl = matches[i].Groups["imgUrl"].Value.ToString();
                if (sUrl != "")
                {
                    ImgStr = sUrl;
                    break;
                }
            }
        }
        return ImgStr;
    }
    /// <summary>
    /// 取得HTML中图片的列表,用“|”分割
    /// </summary>
    /// <param name="sHtmlText">HTML代码</param>
    /// <returns>图片的源地址列表</returns>
    public static string getHtmlImageList(string sHtmlText)
    {
        // 定义正则表达式用来匹配 img 标签
        Regex regImg = new Regex(@"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>", RegexOptions.IgnoreCase);
        // 搜索匹配的字符串
        MatchCollection matches = regImg.Matches(sHtmlText);
        string ImgStr = "";
        // 取得匹配项列表
        if (matches != null && matches.Count > 0)
        {
            for (int i = 0; i < matches.Count; i++)
            {
                string sUrl = matches[i].Groups["imgUrl"].Value.ToString();
                if (ImgStr != "")
                {
                    ImgStr += "|";
                }
                ImgStr += sUrl;
            }
        }
        return ImgStr;
    }
    /// <summary>
    /// 取得HTML中所有图片src的源地址。
    /// </summary>
    /// <param name="sHtmlText">HTML代码</param>
    /// <returns>src的源地址列表</returns>
    public static ArrayList GetHtmlSrcUrlList(string sHtmlText)
    {
        // 定义正则表达式用来匹配 img 标签
        //Regex regImg = new Regex(@"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>", RegexOptions.IgnoreCase);
        Regex regImg = new Regex(@" \b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]* ");
        // 搜索匹配的字符串
        MatchCollection matches = regImg.Matches(sHtmlText);
        int i = 0;
        ArrayList sUrlList = new ArrayList();
        // 取得匹配项列表
        foreach (Match match in matches)
        {
            if (!sUrlList.Contains(match.Groups["imgUrl"].Value)) sUrlList.Add(match.Groups["imgUrl"].Value);
        }
        return sUrlList;
    }
    /// <summary>
    /// 格式化HTML中图片的img,宽度100%,高度100%,请加上链接<a href=show://。
    /// </summary>
    /// <param name="sHtmlText">HTML代码</param>
    /// <param name="styleStr">HTML样式代码</param>
    /// <returns>图片的源地址列表</returns>
    public static string ClearHtmlImageHW(string sHtmlText, string styleStr)
    {
        // 定义正则表达式用来匹配 img 标签
        Regex regImg = new Regex(@"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>", RegexOptions.IgnoreCase);
        // 搜索匹配的字符串
        MatchCollection matches = regImg.Matches(sHtmlText);
        // 取得匹配项列表
        if (matches != null && matches.Count > 0)
        {
            for (int i = 0; i < matches.Count; i++)
            {
                string imgUrl = matches[i].ToString();
                string sUrl = matches[i].ToString();
                string nUrl = "";
                if (sUrl.IndexOf("width") > -1 || sUrl.IndexOf("height") > -1 || sUrl.IndexOf("style") > -1)
                {
                    nUrl = Regex.Replace(sUrl, @"(?<=<img[\s\S]*?)style=((['""])[^'""]*\2|\S+)(?=[^>]*>)", "", RegexOptions.IgnoreCase);
                    nUrl = Regex.Replace(nUrl, @"(?<=<img[\s\S]*?)width=((['""])[^'""]*\2|\S+)(?=[^>]*>)", "", RegexOptions.IgnoreCase);
                    nUrl = Regex.Replace(nUrl, @"(?<=<img[\s\S]*?)height=((['""])[^'""]*\2|\S+)(?=[^>]*>)", "", RegexOptions.IgnoreCase);
                    if (nUrl.IndexOf(" />") != -1)
                    {
                        nUrl = nUrl.Replace(" />", "");
                        nUrl += " " + styleStr + " />";
                    }
                    else if (nUrl.IndexOf("/>") != -1)
                    {
                        nUrl = nUrl.Replace("/>", "");
                        nUrl += " " + styleStr + " />";
                    }
                    else
                    {
                        nUrl = nUrl.Replace(">", "");
                        nUrl += " " + styleStr + " />";
                    }
                    sHtmlText = sHtmlText.Replace(sUrl, nUrl);
                }
                else
                {
                    if (sUrl.IndexOf(" />") != -1)
                    {
                        nUrl = sUrl.Replace(" />", "");
                        nUrl += " " + styleStr + " />";
                    }
                    else if (sUrl.IndexOf("/>") != -1)
                    {
                        nUrl = sUrl.Replace("/>", "");
                        nUrl += " " + styleStr + " />";
                    }
                    else
                    {
                        nUrl = sUrl.Replace(">", "");
                        nUrl += " " + styleStr + " />";
                    }
                    sHtmlText = sHtmlText.Replace(sUrl, nUrl);
                }
                if (nUrl != "")
                {
                    imgUrl = nUrl;
                }
                else
                {
                    imgUrl = sUrl;
                }
                //sHtmlText = sHtmlText.Replace(imgUrl, "<a href=show://" + (i + 1) + ">" + imgUrl + "</a>");
            }
        }
        return sHtmlText;
    }
    #endregion
    #region 页面信息读取
    /// <summary>
    /// 获取请求地址的主机名称
    /// </summary>
    /// <returns></returns>
    public static string Host(string url)
    {
        try
        {
            if (!url.ToLower().Contains("http://") && !url.ToLower().Contains("https://"))
            {
                url = "http://" + url;
            }
            Uri uri = new Uri(url);
            return uri.Host;
        }
        catch (Exception)
        {
            return string.Empty;
        }
    }
    /// <summary>
    /// 获取网页标题
    /// </summary>
    /// <param name="html">html源</param>
    /// <returns></returns>
    public static string Title(string html)
    {
        string titleReg = "<title>([^<]+)</title>";
        return MatchHelper.MatchScalar(html, titleReg);
    }
    /// <summary>
    /// 根据主机名获取对于的IP
    /// </summary>
    /// <param name="host">url</param>
    /// <returns>返回Url对应的IP地址</returns>
    public static string Ip(string host)
    {
        try
        {
            IPHostEntry hostInfo = Dns.GetHostEntry(host);
            return hostInfo.AddressList[0].ToString();
        }
        catch (Exception)
        {
            return string.Empty;
        }
    }
    /// <summary>
    /// 获取Url地址后面的参数键值集
    /// </summary>
    /// <param name="url">url</param>
    /// <returns></returns>
    public static NameValueCollection UrlParseQuery(string url)
    {
        try
        {
            return HttpUtility.ParseQueryString(url);
        }
        catch (Exception)
        {
            return null;
        }
    }
    /// <summary>
    /// Url解码
    /// </summary>
    /// <param name="url">url</param>
    /// <returns></returns>
    public static string UrlDecode(string url)
    {
        try
        {
            return HttpUtility.UrlDecode(url);
        }
        catch (Exception)
        {
            return url;
        }
    }
    /// <summary>
    /// Url编码
    /// </summary>
    /// <param name="url">url</param>
    /// <returns></returns>
    public static string UrlEncode(string url)
    {
        try
        {
            return HttpUtility.UrlEncode(url);
        }
        catch (Exception)
        {
            return url;
        }
    }
    #endregion
    #region 获得用户IP
    /// <summary>
    /// 获得用户IP
    /// </summary>
    public static string GetUserIp()
    {
        string ip;
        string[] temp;
        bool isErr = false;
        if (System.Web.HttpContext.Current.Request.ServerVariables["HTTP_X_ForWARDED_For"] == null)
            ip = System.Web.HttpContext.Current.Request.ServerVariables["REMOTE_ADDR"].ToString();
        else
            ip = System.Web.HttpContext.Current.Request.ServerVariables["HTTP_X_ForWARDED_For"].ToString();
        if (ip.Length > 15)
            isErr = true;
        else
        {
            temp = ip.Split('.');
            if (temp.Length == 4)
            {
                for (int i = 0; i < temp.Length; i++)
                {
                    if (temp[i].Length > 3) isErr = true;
                }
            }
            else
                isErr = true;
        }
        if (isErr)
            return "1.1.1.1";
        else
            return ip;
    }
    #endregion
    #region 通过网络获取IP
    private string url = "http://www.proxy360.cn/default.aspx";
    private string url1 = "http://www.kuaidaili.com/";
    /// <summary>
    /// 获取代理IP集合
    /// </summary>
    public List<string> ProxyIP
    {
        get { return ProcessHtml(HtmlHelper.GetHtmlCode(url)); }
    }
    private List<string> ProcessHtml(string html)
    {
        try
        {
            List<string> list = new List<string>();
            string regIP = "(\\d+.\\d+.\\d+.\\d+)\\s*</span>\\s*<span\\s*class=\"tbBottomLine\"\\s*style=\"width:50px;\">\\s*(\\d+)";
            //string regIP = @"<td>(\d+.\d+.\d+.\d+)</td>\s*<td>(\d+)</td>"; 对应url1
            DataTable dt = MatchHelper.MatchDt(html, regIP);
            if (dt != null && dt.Rows.Count > 0)
            {
                for (int i = 0; i < dt.Rows.Count; i++)
                {
                    string tempIP = dt.Rows[i][0].ToString() + ":" + dt.Rows[i][1].ToString();
                    list.Add(tempIP);
                }
            }
            return list;
        }
        catch (Exception ee)
        { 
            return null;
        }
    }
    #endregion
    #region 获取页面里面的链接信息
    /// <summary>
    /// 获取网页里的所有图片链接
    /// </summary>
    /// <param name="html">html源</param>
    /// <param name="host">当前 html 源网址中的主机名</param>
    /// <returns></returns>
    public static List<ItemImg> ItemImg(string html, string host = "")
    {
        try
        {
            string imgReg = "(<img\\s*[^>]*\\s*>)";
            List<ItemImg> ImgItem = new List<ItemImg>();
            List<string> ImgList = MatchHelper.MatchLists(html, imgReg);
            if (ImgList != null && ImgList.Count > 0)
            {
                string srcReg = "src=\"(\\S+)\"|src=\'(\\S+)\'|data-original=\"(\\S+)\"|data-original='(\\S+)'";
                string altReg = "alt=\"(\\S+)\"|alt=\'(\\S+)\'";
                for (int i = 0; i < ImgList.Count; i++)
                {
                    string _src = MatchHelper.MatchScalar(ImgList[i], srcReg);
                    if (FilterUrl(_src))
                    {
                        ItemImg _imgitem = new ItemImg();
                        string _alt = MatchHelper.MatchScalar(ImgList[i], altReg);
                        _src = FilterSrcUrl(_src, host);
                        if (_src.ToLower().Contains("http://") || _src.ToLower().Contains("https://"))
                        {
                            Uri uri = new Uri(_src);
                            _imgitem.ImgHost = uri.Host;
                        }
                        _imgitem.ImgSrc = _src;
                        _imgitem.ImgAlt = _alt;
                        _imgitem.ImgLable = ImgList[i];
                        ImgItem.Add(_imgitem);
                    }
                }
            }
            return ImgItem;
        }
        catch (Exception ee)
        { 
            return null;
        } 
    }
    /// <summary>
    /// 获取 html 源中所有 a 标签的链接信息
    /// </summary>
    /// <param name="html">html源</param>
    /// <param name="host">当前 html 源网址中的主机名</param>
    /// <returns></returns>
    internal static List<ItemA> ItemA(string html, string host = "")
    {
        try
        {
            List<ItemA> Item = new List<ItemA>();
            string aReg = "(<a\\s*[^<]*\\s*>\\s*[^<]*\\s*<\\s*/\\s*a\\s*>)";
            List<string> aList = MatchHelper.MatchLists(html, aReg);
            if (aList != null && aList.Count > 0)
            {
                string hrefReg = "href=\"(\\S+)\"|href='(\\S+)'";
                string title = "title=\"(\\S+)\"|title=\'(\\S+)\'";
                string titleShow = ">([^<]+)<";
                for (int i = 0; i < aList.Count; i++)
                {
                    string _url = MatchHelper.MatchScalar(aList[i], hrefReg).Replace("\"", "").Replace("'", "");
                    if (FilterUrl(_url))
                    {
                        ItemA _aitem = new ItemA();
                        string _title = MatchHelper.MatchScalar(aList[i], title);
                        string _content = MatchHelper.MatchScalar(aList[i], titleShow);
                        _url = FilterSrcUrl(_url, host);
                        if (_url.ToLower().Contains("http://") || _url.ToLower().Contains("https://"))
                        {
                            Uri uri = new Uri(_url);
                            _aitem.AHost = uri.Host;
                        }
                        _aitem.Ahref = _url;
                        _aitem.ATitle = _title;
                        _aitem.AContent = _content;
                        _aitem.ALable = aList[i];
                        Item.Add(_aitem);
                    }
                }
            }
            return Item;
        }
        catch (Exception ee)
        { 
            return null;
        }
    }
    /// <summary>
    /// 私有函数,过滤不合法的url
    /// </summary>
    /// <param name="url">待判别的url</param>
    /// <returns></returns>
    static bool FilterUrl(string url)
    {
        bool ok = true;
        if (url == "") ok = false;
        if (url.StartsWith("javascript:")) ok = false;
        if (url.StartsWith("#")) ok = false;
        return ok;
    }
    /// <summary>
    /// 判断url格式是否标准,不标准则将其标准话
    /// </summary>
    /// <param name="srcUrl">待判别的url</param>
    /// <param name="host">当前 html 源网址中的主机名</param>
    /// <returns></returns>
    static string FilterSrcUrl(string srcUrl, string host)
    {
        if (!srcUrl.ToLower().Contains("http://") && !srcUrl.ToLower().Contains("https://"))
        {
            if (host.EndsWith("/") && srcUrl.StartsWith("/"))
            {
                srcUrl = host + srcUrl;
                srcUrl = srcUrl.Replace("//", "/");
            }
            else if (!host.EndsWith("/") && !srcUrl.StartsWith("/"))
                srcUrl = host + "/" + srcUrl;
            else
                srcUrl = host + srcUrl;
            srcUrl = "http://" + srcUrl;
        }
        return srcUrl;
    }
    /// <summary>
    /// 获取 html 源中的图片链接,非img标签中的链接
    /// </summary>
    /// <param name="html">html源</param>
    /// <param name="host">当前 html 源网址中的主机名</param>
    /// <param name="type">
    /// <para>图片类型,可填</para>
    /// <para>1.jpg</para>
    /// <para>2.png</para>
    /// <para>3.bmp</para>
    /// <para>4.gif</para>
    /// <para>5.其他</para>
    /// <para>注意大小写,有可能因为大小写而导致无法匹配</para>
    /// </param>
    /// <returns></returns>
    public static List<string> ListImg(string html, string host = "", string type = "jpg")
    {
        try
        {
            string picReg = "[\"|']([-a-zA-Z0-9@:%_\\+.~#?&//=]+." + type + ")[\"|']";
            List<string> picList = MatchHelper.MatchLists(html, picReg);
            if (picList != null && picList.Count > 0)
            {
                if (host != "" && (host.Contains("http://") || host.Contains("https://")))
                {
                    for (int i = 0; i < picList.Count; i++)
                    {
                        if (!picList[i].Contains("http://") && !picList[i].Contains("https://"))
                        {
                            picList[i] = host + picList[i];
                        }
                    }
                }
            }
            return picList;
        }
        catch (Exception ee)
        { 
            return null;
        }
    }
    #endregion
    #region 页面源代码读取
    /// <summary>
    /// get方式读取数据
    /// </summary>
    /// <param name="strUrl">地址</param>
    /// <returns>返回数据</returns>
    public static string GetModel(string strUrl)
    {
        string strRet = null;
        try
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(strUrl);
            request.Timeout = 2000;
            HttpWebResponse response = (HttpWebResponse)request.GetResponse();
            System.IO.Stream resStream = response.GetResponseStream();
            Encoding encode = System.Text.Encoding.UTF8;
            StreamReader readStream = new StreamReader(resStream, encode);
            Char[] read = new Char[256];
            int count = readStream.Read(read, 0, 256);
            while (count > 0)
            {
                String str = new String(read, 0, count);
                strRet = strRet + str;
                count = readStream.Read(read, 0, 256);
            }
            resStream.Close();
        }
        catch (Exception e)
        {
            strRet = "";
        }
        return strRet;
    }
    /// <summary>
    /// 提供通过POST方法获取页面的方法
    /// </summary>
    /// <param name="urlString">请求的URL</param>
    /// <param name="encoding">页面使用的编码</param>
    /// <param name="postDataString">POST数据</param>
    /// <param name="Method">Method方式</param>
    /// <returns>获取的页面</returns>
    public static string GetHtmlFromPost(string urlString, Encoding encoding, string postDataString)
    {
        //定义局部变量
        CookieContainer cookieContainer = new CookieContainer();
        HttpWebRequest httpWebRequest = null;
        HttpWebResponse httpWebResponse = null;
        Stream inputStream = null;
        Stream outputStream = null;
        StreamReader streamReader = null;
        string htmlString = string.Empty;
        //转换POST数据
        byte[] postDataByte = encoding.GetBytes(postDataString);
        //建立页面请求
        try
        {
            httpWebRequest = WebRequest.Create(urlString) as HttpWebRequest;
        }
        //处理异常
        catch (Exception ex)
        {
            //throw new Exception("建立页面请求时发生错误!", ex);
        }
        //指定请求处理方式
        httpWebRequest.Method = "POST";
        httpWebRequest.KeepAlive = false;
        httpWebRequest.ContentType = "application/x-www-form-urlencoded";
        httpWebRequest.CookieContainer = cookieContainer;
        httpWebRequest.ContentLength = postDataByte.Length;
        //向服务器传送数据
        try
        {
            inputStream = httpWebRequest.GetRequestStream();
            inputStream.Write(postDataByte, 0, postDataByte.Length);
        }
        //处理异常
        catch (Exception ex)
        {
            //throw new Exception("发送POST数据时发生错误!", ex);
        }
        finally
        {
            inputStream.Close();
        }
        //接受服务器返回信息
        try
        {
            httpWebResponse = httpWebRequest.GetResponse() as HttpWebResponse;
            outputStream = httpWebResponse.GetResponseStream();
            streamReader = new StreamReader(outputStream, encoding);
            htmlString = streamReader.ReadToEnd();
        }
        //处理异常
        catch (Exception ex)
        {
            //throw new Exception("接受服务器返回页面时发生错误!", ex);
        }
        finally
        {
            if (streamReader != null)
            {
                streamReader.Close();
            }
        }
        if (httpWebResponse != null)
        {
            foreach (Cookie cookie in httpWebResponse.Cookies)
            {
                cookieContainer.Add(cookie);
            }
        }
        return htmlString;
    }
    /// <summary>
    /// 通过GET方式获取页面的方法
    /// </summary>
    /// <param name="urlString">请求的URL</param>
    /// <param name="encoding">页面编码</param>
    /// <returns></returns>
    public static string GetHtmlFromGet(string urlString, Encoding encoding)
    {
        //定义局部变量
        HttpWebRequest httpWebRequest = null;
        HttpWebResponse httpWebRespones = null;
        Stream stream = null;
        string htmlString = string.Empty;
        //请求页面
        try
        {
            httpWebRequest = WebRequest.Create(urlString) as HttpWebRequest;
        }
        //处理异常
        catch (Exception ex)
        {
            //throw new Exception("建立页面请求时发生错误!", ex);
        }
        httpWebRequest.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 2.0.50727; Maxthon 2.0)";
        //获取服务器的返回信息
        try
        {
            httpWebRespones = (HttpWebResponse)httpWebRequest.GetResponse();
            stream = httpWebRespones.GetResponseStream();
        }
        //处理异常
        catch (Exception ex)
        {
            //throw new Exception("接受服务器返回页面时发生错误!", ex);
        }
        StreamReader streamReader = new StreamReader(stream, encoding);
        //读取返回页面
        try
        {
            htmlString = streamReader.ReadToEnd();
        }
        //处理异常
        catch (Exception ex)
        {
            //throw new Exception("读取页面数据时发生错误!", ex);
        }
        //释放资源返回结果
        streamReader.Close();
        stream.Close();
        return htmlString;
    }
    #endregion
    #region 从QueryString截取参数
    /// <summary>
    /// 截取参数,取不到值时返回""
    /// </summary>
    /// <param name="s">不带?号的url参数</param>
    /// <param name="para">要取的参数</param>
    public static string QueryString(string s, string para)
    {
        if (string.IsNullOrEmpty(s))
        {
            return s;
        }
        s = s.Trim('?').Replace("%26", "&").Replace('?', '&');
        int num = s.Length;
        for (int i = 0; i < num; i++)
        {
            int startIndex = i;
            int num4 = -1;
            while (i < num)
            {
                char ch = s[i];
                if (ch == '=')
                {
                    if (num4 < 0)
                    {
                        num4 = i;
                    }
                }
                else if (ch == '&')
                {
                    break;
                }
                i++;
            }
            string str = null;
            string str2 = null;
            if (num4 >= 0)
            {
                str = s.Substring(startIndex, num4 - startIndex);
                str2 = s.Substring(num4 + 1, (i - num4) - 1);
                if (str == para)
                {
                    return System.Web.HttpUtility.UrlDecode(str2);
                }
            }
        }
        return "";
    }
    #endregion
    #region 模拟页面请求地址(可以使用在上传文件上)
    /// <summary>
    /// 同步方式发起http post请求,可以同时上传文件
    /// </summary>
    /// <param name="url">请求URL</param>
    /// <param name="queryString">请求参数字符串</param>
    /// <param name="files">上传文件列表</param>
    /// <returns>请求返回值</returns>
    public static string HttpPostWithFile(string url, string queryString, List<QueryParameter> files)
    {
        Stream requestStream = null;
        string responseData = null;
        string boundary = DateTime.Now.Ticks.ToString("x");
        HttpWebRequest webRequest = WebRequest.Create(url) as HttpWebRequest;
        webRequest.ServicePoint.Expect100Continue = false;
        webRequest.Timeout = 20000;
        webRequest.ContentType = "multipart/form-data;charset=utf-8;boundary=" + boundary;
        webRequest.Method = "POST";
        webRequest.KeepAlive = false;
        webRequest.Credentials = CredentialCache.DefaultCredentials;
        try
        {
            Stream memStream = new MemoryStream();
            byte[] beginBoundary = Encoding.UTF8.GetBytes("\r\n--" + boundary + "\r\n");
            byte[] endBoundary = Encoding.UTF8.GetBytes("\r\n--" + boundary + "--\r\n");
            // byte[] boundarybytes = System.Text.Encoding.ASCII.GetBytes("\r\n--" + boundary + "\r\n");
            // string formdataTemplate = "\r\n--" + boundary + "\r\nContent-Disposition: form-data; name=\"{0}\"\r\n\r\n{1}";
            string formdataTemplate = "Content-Disposition: form-data; name=\"{0}\"\r\n\r\n{1}";
            List<QueryParameter> listParams = GetQueryParameters(queryString);
            foreach (QueryParameter param in listParams)
            {
                // 写入头
                memStream.Write(beginBoundary, 0, beginBoundary.Length);
                string formitem = string.Format(formdataTemplate, param.Name, param.Value);
                byte[] formitembytes = Encoding.UTF8.GetBytes(formitem);
                memStream.Write(formitembytes, 0, formitembytes.Length);
            }
            // memStream.Write(boundarybytes, 0, boundarybytes.Length);
            string headerTemplate = "Content-Disposition: form-data; name=\"{0}\"; filename=\"{1}\"\r\nContent-Type: \"{2}\"\r\n\r\n";
            foreach (QueryParameter param in files)
            {
                string name = param.Name;
                string filePath = param.Value;
                string file = Path.GetFileName(filePath);
                string contentType = GetContentType(file);
                // 写入头
                memStream.Write(beginBoundary, 0, beginBoundary.Length);
                string header = string.Format(headerTemplate, name, file, contentType);
                byte[] headerbytes = System.Text.Encoding.UTF8.GetBytes(header);
                memStream.Write(headerbytes, 0, headerbytes.Length);
                FileStream fileStream = new FileStream(filePath, FileMode.Open, FileAccess.Read);
                byte[] buffer = new byte[1024];
                int bytesRead = 0;
                while ((bytesRead = fileStream.Read(buffer, 0, buffer.Length)) != 0)
                {
                    memStream.Write(buffer, 0, bytesRead);
                }
                // memStream.Write(boundarybytes, 0, boundarybytes.Length);
                // 写入结尾
                memStream.Write(endBoundary, 0, endBoundary.Length);
                fileStream.Close();
            }
            webRequest.ContentLength = memStream.Length;
            requestStream = webRequest.GetRequestStream();
            memStream.Position = 0;
            byte[] tempBuffer = new byte[memStream.Length];
            memStream.Read(tempBuffer, 0, tempBuffer.Length);
            memStream.Close();
            requestStream.Write(tempBuffer, 0, tempBuffer.Length);
        }
        catch
        {
            throw;
        }
        finally
        {
            requestStream.Close();
            requestStream = null;
        }
        try
        {
            responseData = WebResponseGet(webRequest);
            webRequest = null;
            return responseData;
        }
        catch (Exception ex)
        {
            throw ex;
        }
    }
    /// <summary>
    /// 获取返回结果http get请求
    /// </summary>
    /// <param name="webRequest">webRequest对象</param>
    /// <returns>请求返回值</returns>
    public static string WebResponseGet(HttpWebRequest webRequest)
    {
        try
        {
            HttpWebResponse httpWebResponse = (HttpWebResponse)webRequest.GetResponse();
            StreamReader responseReader = null;
            string responseData = String.Empty;
            responseReader = new StreamReader(webRequest.GetResponse().GetResponseStream());
            responseData = responseReader.ReadToEnd();
            webRequest.GetResponse().GetResponseStream().Close();
            responseReader.Close();
            responseReader = null;
            return responseData;
        }
        catch (Exception ex)
        {
            throw ex;
        }
    }
    /// <summary>
    /// ParseQueryString
    /// </summary>
    /// <param name="strValue"></param>
    /// <returns></returns>
    public static List<QueryParameter> GetQueryParameters(string strValue)
    {
        List<QueryParameter> list = new List<QueryParameter>();
        if (!string.IsNullOrEmpty(strValue))
        {
            foreach (var item in strValue.Trim(' ', '?', '&').Split('&'))
            {
                if (item.IndexOf('=') > -1)
                {
                    var temp = item.Split('=');
                    list.Add(new QueryParameter(temp[0], temp[1]));
                }
                else
                {
                    list.Add(new QueryParameter(item, string.Empty));
                }
            }
        }
        return list;
    }
    /// <summary>
    /// 字符串拼接
    /// </summary>
    /// <param name="paras"></param>
    /// <returns></returns>
    public static string GetQueryFromParas(List<QueryParameter> paras)
    {
        if (paras == null || paras.Count == 0)
            return "";
        StringBuilder sbList = new StringBuilder();
        int count = 1;
        foreach (QueryParameter para in paras)
        {
            sbList.AppendFormat("{0}={1}", para.Name, para.Value);
            if (count < paras.Count)
            {
                sbList.Append("&");
            }
            count++;
        }
        return sbList.ToString(); ;
    }
    /// <summary>
    /// 根据文件名获取文件类型
    /// </summary>
    /// <param name="fileName"></param>
    /// <returns></returns>
    public static string GetContentType(string fileName)
    {
        string contentType = "application/octetstream";
        string ext = Path.GetExtension(fileName).ToLower();
        RegistryKey registryKey = Registry.ClassesRoot.OpenSubKey(ext);
        if (registryKey != null && registryKey.GetValue("Content Type") != null)
        {
            contentType = registryKey.GetValue("Content Type").ToString();
        }
        return contentType;
    }
    /// <summary>
    /// Utc时间转本地时间,原格式:Wed Nov 17 15:07:48 +0800 2010
    /// </summary>
    /// <param name="strValue">原格式:Wed Nov 17 15:07:48 +0800 2010</param>
    /// <returns></returns>
    public static string UtcToDateTime(string strValue)
    {
        if (!string.IsNullOrEmpty(strValue))
        {
            //原格式:Wed Nov 17 15:07:48 +0800 2010
            string[] str = strValue.Split(' ');
            //转格式:Wed Nov 17 2010 15:07:48
            return str[0] + " " + str[1] + " " + str[2] + " " + str[5] + " " + str[3];
        }
        else
        {
            return "";
        }
    } 
    #endregion
}
/// <summary>
/// Holds the pieces of one HTML anchor (a) tag.
/// </summary>
public class ItemA
{
    /// <summary>
    /// The link of the a tag.
    /// </summary>
    public string Ahref { get; set; }
    /// <summary>
    /// The title of the a tag.
    /// </summary>
    public string ATitle { get; set; }
    /// <summary>
    /// The content of the a tag.
    /// </summary>
    public string AContent { get; set; }
    /// <summary>
    /// The host name of the a tag's link.
    /// </summary>
    public string AHost { get; set; }
    /// <summary>
    /// The a tag itself (presumably the tag's full markup; confirm against the code that fills it).
    /// </summary>
    public string ALable { get; set; }
}
/// <summary>
/// Holds the pieces of one HTML img tag.
/// </summary>
public class ItemImg
{
    /// <summary>
    /// The link of the img tag.
    /// </summary>
    public string ImgSrc { get; set; }
    /// <summary>
    /// The alternative text of the img tag.
    /// </summary>
    public string ImgAlt { get; set; }
    /// <summary>
    /// The host name of the img tag's link.
    /// </summary>
    public string ImgHost { get; set; }
    /// <summary>
    /// The img tag itself (presumably the tag's full markup; confirm against the code that fills it).
    /// </summary>
    public string ImgLable { get; set; }
}
/// <summary>
/// A read-only name/value pair used for query-string and form parameters.
/// </summary>
public class QueryParameter
{
    private readonly string name;
    private readonly string value;

    /// <summary>
    /// Creates a parameter from a name and a string value.
    /// </summary>
    /// <param name="name">Parameter name; null is exposed as an empty string.</param>
    /// <param name="value">Parameter value; null is exposed as an empty string.</param>
    public QueryParameter(string name, string value)
    {
        this.name = name;
        this.value = value;
    }

    /// <summary>
    /// Creates a parameter from a name and an arbitrary value, stored as its ToString() text.
    /// </summary>
    /// <param name="name">Parameter name; null is exposed as an empty string.</param>
    /// <param name="value">Parameter value; null is exposed as an empty string.</param>
    public QueryParameter(string name, object value)
    {
        this.name = name;
        // A null object is stored as null rather than dereferenced, matching the
        // null tolerance of the string overload and of the Value getter.
        this.value = value == null ? null : value.ToString();
    }

    /// <summary>
    /// The parameter name with surrounding whitespace removed; never null.
    /// </summary>
    public string Name
    {
        get { return name == null ? string.Empty : name.Trim(); }
    }

    /// <summary>
    /// The parameter value with surrounding whitespace removed; never null.
    /// </summary>
    public string Value
    {
        get { return value == null ? string.Empty : value.Trim(); }
    }
}


« 上一篇 | 下一篇 »

发表评论:

◎欢迎参与讨论,请在这里发表您的看法、交流您的观点。