最近在做页面采集时发现,抓取到的页面有时会出现乱码,时好时坏。排查后发现编码设置并没有问题,原因是服务器返回的内容有时经过了 GZIP 压缩,而代码没有先解压就直接按文本读取。
在朋友们的热心帮助下终于解决了。下面贴出代码:根据响应的 Content-Encoding 分别处理 gzip、deflate 和未压缩的页面,避免抓取结果乱码。
核心代码如下:
// Read the response body as UTF-8 text, undoing gzip/deflate compression first
// when the Content-Encoding response header names one of them.
// Expects `req` (the prepared HttpWebRequest) and `sHTML` (the string that
// receives the page source) to be declared by the surrounding code.
using (HttpWebResponse response = (HttpWebResponse)req.GetResponse())
{
    string contentEncoding = response.ContentEncoding;
    Stream body = response.GetResponseStream();

    // Match the encoding token with an ordinal, case-insensitive comparison.
    // ToLower() follows the current culture: under tr-TR "GZIP" lower-cases to
    // "gzıp" (dotless i), which never contains "gzip".
    if (contentEncoding.IndexOf("gzip", StringComparison.OrdinalIgnoreCase) >= 0)
    {
        body = new GZipStream(body, CompressionMode.Decompress);
    }
    else if (contentEncoding.IndexOf("deflate", StringComparison.OrdinalIgnoreCase) >= 0)
    {
        body = new DeflateStream(body, CompressionMode.Decompress);
    }

    // One reader serves all three cases; disposing it also disposes the
    // decompression wrapper (if any) and the network stream beneath it.
    using (StreamReader reader = new StreamReader(body, Encoding.UTF8))
    {
        sHTML = reader.ReadToEnd();
    }
}
完整前台代码gethtml.aspx
<%@ Page Language="C#" AutoEventWireup="true" CodeFile="gethtml.aspx.cs" Inherits="gethtml" ValidateRequest="false" %>
<%-- Front-end page of the fetch tool: a URL box, a button and a textarea for the result.
     The server-side logic lives in the code-behind file named by CodeFile (class "gethtml").
     ValidateRequest is switched off on this page; presumably so that posting the form back
     while the textarea holds fetched markup is not rejected by request validation (confirm). --%>
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml">
<head runat="server">
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
<title>抓取页面</title>
</head>
<body>
<form id="form1" runat="server">
<div>
<%-- "url" holds the address to fetch; Button1's OnClick names Button1_Click as its server-side click handler. --%>
url地址:<asp:TextBox ID="url" runat="server" Text="http://www.baidu.com" style="width:400px;"></asp:TextBox><asp:Button ID="Button1" runat="server" Text="抓取" OnClick="Button1_Click" /><br />
<%-- Server-side textarea "code": the code-behind assigns the fetched page source to this control. --%>
<textarea name="code" id="code" runat="server" style="width:530px;height:300px;"></textarea>
</div>
</form>
</body>
</html>
完整后台代码gethtml.aspx.cs
using System;
using System.Net;
using System.IO;
using System.Text;
using System.IO.Compression;
/// <summary>
/// Code-behind for gethtml.aspx: downloads the page at the URL typed into the
/// <c>url</c> text box and shows its source in the <c>code</c> textarea.
/// </summary>
public partial class gethtml : System.Web.UI.Page
{
    protected void Page_Load(object sender, EventArgs e)
    {
    }

    /// <summary>
    /// Downloads <paramref name="url"/> with an HTTP GET and returns the body decoded
    /// as UTF-8, decompressing it first when the response's Content-Encoding header
    /// names gzip or deflate.
    /// </summary>
    /// <param name="url">
    /// Address to fetch. Surrounding whitespace is ignored, and "http://" is prepended
    /// when the value does not already start with "http://" or "https://".
    /// </param>
    /// <returns>The response body as text.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
    /// <exception cref="UriFormatException">The normalized address is not a valid URI.</exception>
    /// <exception cref="WebException">The request failed.</exception>
    public static string GetHtmlWithUtf(string url)
    {
        if (url == null)
        {
            throw new ArgumentNullException("url");
        }

        url = url.Trim();

        // Only a scheme at the very start counts. A Contains() check would treat
        // "example.com/?next=http://other" as already absolute and skip the prefix.
        bool hasScheme = url.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
            || url.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
        if (!hasScheme)
        {
            url = "http://" + url;
        }

        HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
        // The property takes the header VALUE only; including "User-Agent: " here
        // would send "User-Agent: User-Agent: Mozilla/4.0 ...".
        req.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705;)";
        req.Accept = "*/*";
        req.Headers.Add("Accept-Language", "zh-cn,en-us;q=0.5");
        // NOTE(review): Content-Type describes a request body and this request has
        // none; kept so the request headers otherwise stay as they were.
        req.ContentType = "text/xml";

        using (HttpWebResponse response = (HttpWebResponse)req.GetResponse())
        using (StreamReader reader = new StreamReader(OpenDecodedStream(response), Encoding.UTF8))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Returns the response stream, wrapped in a decompressor when the
    /// Content-Encoding header contains "gzip" or "deflate".
    /// </summary>
    private static Stream OpenDecodedStream(HttpWebResponse response)
    {
        string contentEncoding = response.ContentEncoding;
        Stream body = response.GetResponseStream();

        // Ordinal, case-insensitive match: ToLower() is culture-sensitive and under
        // tr-TR turns "GZIP" into "gzıp", which never contains "gzip".
        if (contentEncoding.IndexOf("gzip", StringComparison.OrdinalIgnoreCase) >= 0)
        {
            return new GZipStream(body, CompressionMode.Decompress);
        }

        if (contentEncoding.IndexOf("deflate", StringComparison.OrdinalIgnoreCase) >= 0)
        {
            return new DeflateStream(body, CompressionMode.Decompress);
        }

        return body;
    }

    /// <summary>
    /// Click handler for the fetch button: downloads the typed URL and shows the page
    /// source, or the failure message, in the textarea.
    /// </summary>
    protected void Button1_Click(object sender, EventArgs e)
    {
        string urlstr = url.Text;
        try
        {
            // InnerText HTML-encodes the value. InnerHtml would emit the fetched markup
            // raw, so a "</textarea>" in the page would close the control and inject
            // arbitrary markup/script into this page.
            code.InnerText = GetHtmlWithUtf(urlstr);
        }
        catch (UriFormatException ex)
        {
            // Empty or malformed address typed by the user.
            code.InnerText = "抓取失败:" + ex.Message;
        }
        catch (WebException ex)
        {
            // DNS failure, timeout, non-success HTTP status, etc.
            code.InnerText = "抓取失败:" + ex.Message;
        }
    }
}
(完)
如果有问题或技术上的想法,欢迎在此与大家分享,也可以加入前端爱好者QQ群(141999928)一起学习进步:
【幸凡前端技术交流群】
如果您觉得本文对您的学习有所帮助,欢迎捐赠以示鼓励:支付宝(左)或微信(右)