using System;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;

/// <summary>
/// Console sample that downloads a few web pages and prints, for each one,
/// the character encoding declared inside the page and the page title.
/// </summary>
internal class Program
{
    // Matches a "charset=NAME" declaration such as the one in
    //   <meta http-equiv="Content-Type" content="text/html; charset=gb2312">
    // and also the HTML5 form <meta charset="utf-8"> (optional whitespace
    // around '=' and an optional opening quote are tolerated).
    private static readonly Regex CharsetRegex = new Regex(
        @"(?i)\bcharset\s*=\s*[""']?(?<charset>[-a-zA-Z_0-9]+)");

    // Matches the first <title ...>...</title> element. (?s) lets '.' span
    // line breaks, (?i) ignores tag case, and the attribute part accepts
    // quoted values so a '>' inside a quoted attribute does not end the tag.
    private static readonly Regex TitleRegex = new Regex(
        @"(?si)<title(?:\s+(?:""[^""]*""|'[^']*'|[^""'>])*)?>(?<title>.*?)</title>");

    /// <summary>
    /// Downloads the page at <paramref name="url"/> and decodes it with the
    /// encoding named by the page's own charset declaration (UTF-8 when no
    /// usable declaration is found).
    /// </summary>
    /// <param name="url">Address of the page to download.</param>
    /// <returns>The decoded HTML text.</returns>
    public static string GetHtml(string url)
    {
        return GetHtml(url, null);
    }

    /// <summary>
    /// Downloads the page at <paramref name="url"/> and decodes it with
    /// <paramref name="encoding"/>; when <paramref name="encoding"/> is null
    /// the encoding is detected from the page content.
    /// </summary>
    /// <param name="url">Address of the page to download.</param>
    /// <param name="encoding">Encoding to force, or null to auto-detect.</param>
    /// <returns>The decoded HTML text.</returns>
    private static string GetHtml(string url, Encoding encoding)
    {
        byte[] buf;

        // WebClient is IDisposable; release it as soon as the bytes are read.
        using (WebClient client = new WebClient())
        {
            buf = client.DownloadData(url);
        }

        if (encoding != null)
        {
            return encoding.GetString(buf);
        }

        // First pass: decode as UTF-8 only to locate the charset declaration.
        // The declaration is made of ASCII characters, so it is readable here
        // whenever the real encoding is ASCII-compatible.
        string html = Encoding.UTF8.GetString(buf);
        Encoding declared = GetEncoding(html);

        // Compare by code page rather than by reference so that any UTF-8
        // instance is recognised and the bytes are not decoded a second time.
        if (declared == null || declared.CodePage == Encoding.UTF8.CodePage)
        {
            return html;
        }

        return declared.GetString(buf);
    }

    /// <summary>
    /// Extracts the encoding named by the first charset declaration in
    /// <paramref name="html"/>.
    /// </summary>
    /// <param name="html">HTML text to inspect.</param>
    /// <returns>
    /// The matching <see cref="Encoding"/>, or null when there is no charset
    /// declaration or its name is not a supported encoding.
    /// </returns>
    private static Encoding GetEncoding(string html)
    {
        Match match = CharsetRegex.Match(html);
        if (!match.Success)
        {
            // No declaration at all: report "unknown" without raising.
            return null;
        }

        try
        {
            return Encoding.GetEncoding(match.Groups["charset"].Value);
        }
        catch (ArgumentException)
        {
            // The page named an encoding this runtime does not provide.
            return null;
        }
    }

    /// <summary>
    /// Extracts the trimmed text of the first title element in
    /// <paramref name="html"/>.
    /// </summary>
    /// <param name="html">HTML text to inspect.</param>
    /// <returns>The title text, or an empty string when no title is found.</returns>
    private static string GetTitle(string html)
    {
        return TitleRegex.Match(html).Groups["title"].Value.Trim();
    }

    /// <summary>
    /// Downloads <paramref name="url"/> and writes its detected encoding and
    /// title to the console as "[encoding] [title]".
    /// </summary>
    /// <param name="url">Address of the page to report on.</param>
    private static void PrintEncodingAndTitle(string url)
    {
        string html = GetHtml(url);
        Console.WriteLine("[{0}] [{1}]", GetEncoding(html), GetTitle(html));
    }

    /// <summary>Program entry point: reports on a fixed list of sample sites.</summary>
    private static void Main()
    {
        PrintEncodingAndTitle("http://www.msdn.net/");
        PrintEncodingAndTitle("http://www.cnblogs.com/");
        PrintEncodingAndTitle("http://www.cnblogs.com/skyiv/");
        PrintEncodingAndTitle("http://www.csdn.net/");
        PrintEncodingAndTitle("http://news.163.com/");
    }
}

/* Program output recorded with the original listing (a live run depends on
   the current content of each site and may differ):
[] [MSDN: Microsoft Developer Network]
[System.Text.UTF8Encoding] [博客园 - 程序员的网上家园]
[System.Text.UTF8Encoding] [空间/IV - 博客园]
[System.Text.UTF8Encoding] [CSDN.NET - 中国最大的IT技术社区,为IT专业技术人员提供最全面的信息传播和服务平台]
[System.Text.DBCSCodePageEncoding] [新闻中心_网易新闻]
*/
// Reposted from: (source link missing in this copy)