如何在Java中处理非UTF8 html页面? [英] How to handle non-UTF8 html page in Java?
问题描述
我的任务是使用Java从URL获取HTML字符串。
我知道如何使用HttpURLConnection和InputStream来获取字符串。
但是,我有一些页面的编码问题。
如果某些页面使用UTF-8以外的编码(例如GB2312),我得到的字符串只是乱码或问号。
有谁能告诉我如何解决这个问题吗?
谢谢
下面是我从一个URL下载HTML的代码。
private String downloadHtml(String urlString){
URL url = null;
InputStream inStr = null;
StringBuffer buffer = new StringBuffer();
try {
url = new URL(urlString);
HttpURLConnection conn =(HttpURLConnection)url.openConnection(); // Cast不应该失败
HttpURLConnection.setFollowRedirects(true);
//允许GZip和Deflate(ZLib)编码
//conn.setRequestProperty(\"Accept-Encoding,gzip,deflate);
String encoding = conn.getContentEncoding();
inStr = null;
//根据
创建适当的流包装器//编码类型
if(encoding!= null&& encoding.equalsIgnoreCase(gzip)){
inStr = new GZIPInputStream(conn.getInputStream());
} else if(encoding!= null&& encoding.equalsIgnoreCase(deflate)){
inStr = new InflaterInputStream(conn.getInputStream(),
new Inflater(true) );
} else {
inStr = conn.getInputStream();
}
int ptr = 0;
InputStreamReader inStrReader = new InputStreamReader(inStr,Charset.forName(GB2312));
while((ptr = inStrReader.read())!= -1){
buffer.append((char)ptr);
}
inStrReader.close();
conn.disconnect();
}
catch(异常e){
e.printStackTrace();
}
finally {
if(inStr!= null)
try {
inStr.close();
} catch(IOException e){
// TODO自动生成的catch块
e.printStackTrace();
}
}
return buffer.toString();
}
使用 InputStreamReader ,并指定您的字符集,如下所示:
inStr = new InputStreamReader(InputStream, Charset.forName("GB2312"));
以下代码适用于我:
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.zip.GZIPInputStream;
import java.util.zip.Inflater;
import java.util.zip.InflaterInputStream;
/**
 * Small demo that downloads a web page and prints it, decoding the body with
 * the charset the server declares (GB2312 when none is declared).
 */
public class Foo {

    /**
     * Fetches a sample GB2312-encoded page and prints its HTML.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        System.out.println(downloadHtml("http://baike.baidu.com/view/6000001.htm"));
    }

    /**
     * Downloads the document at {@code urlString} and returns its body as text.
     *
     * <p>The body is decoded with the charset named in the response's
     * {@code Content-Type} header; when the header is missing or names no
     * usable charset, GB2312 is used. A response whose {@code Content-Encoding}
     * is {@code gzip} or {@code deflate} is decompressed before decoding.
     *
     * <p>Failures are handled best-effort: the exception is printed and
     * whatever text was read before the failure (possibly the empty string)
     * is returned.
     *
     * @param urlString absolute HTTP or HTTPS URL of the page to fetch
     * @return the decoded response body, or the part read before an error occurred
     */
    private static String downloadHtml(String urlString) {
        StringBuilder html = new StringBuilder();
        HttpURLConnection conn = null;
        InputStream body = null;
        Inflater inflater = null;
        try {
            conn = (HttpURLConnection) new URL(urlString).openConnection();
            // Per-connection switch. The static setFollowRedirects() only changes
            // the JVM-wide default picked up by connections created afterwards.
            conn.setInstanceFollowRedirects(true);

            // Compression is not requested here; to request it, set the
            // Accept-Encoding request header to "gzip, deflate" before connecting.
            // A compressed response is still handled in case the server sends one.
            String encoding = conn.getContentEncoding();
            InputStream raw = conn.getInputStream();
            body = raw; // closed in finally even if building a wrapper below fails
            if ("gzip".equalsIgnoreCase(encoding)) {
                body = new GZIPInputStream(raw);
            } else if ("deflate".equalsIgnoreCase(encoding)) {
                inflater = new Inflater(true);
                body = new InflaterInputStream(raw, inflater);
            }

            // The reader is not closed separately: closing 'body' in finally
            // releases the same underlying stream.
            InputStreamReader reader =
                    new InputStreamReader(body, charsetFromContentType(conn.getContentType()));
            char[] chunk = new char[8192];
            int count;
            while ((count = reader.read(chunk)) != -1) {
                html.append(chunk, 0, count);
            }
        } catch (Exception e) {
            // Deliberately broad: bad URL, non-HTTP URL and I/O errors all fall
            // back to returning what was read so far.
            e.printStackTrace();
        } finally {
            if (body != null) {
                try {
                    body.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (inflater != null) {
                // InflaterInputStream.close() does not end a caller-supplied Inflater.
                inflater.end();
            }
            if (conn != null) {
                conn.disconnect();
            }
        }
        return html.toString();
    }

    /**
     * Picks the charset to decode a response with.
     *
     * @param contentType value of the {@code Content-Type} header, may be {@code null}
     * @return the charset named by the header's {@code charset} parameter, or
     *         GB2312 when the parameter is absent, malformed or unsupported
     */
    private static Charset charsetFromContentType(String contentType) {
        Charset fallback = Charset.forName("GB2312");
        if (contentType == null) {
            return fallback;
        }
        String key = "charset=";
        for (String part : contentType.split(";")) {
            String param = part.trim();
            if (!param.regionMatches(true, 0, key, 0, key.length())) {
                continue;
            }
            String name = param.substring(key.length()).trim();
            if (name.length() >= 2 && name.startsWith("\"") && name.endsWith("\"")) {
                name = name.substring(1, name.length() - 1);
            }
            try {
                return Charset.forName(name);
            } catch (IllegalArgumentException e) {
                // Covers both illegal and unsupported charset names.
                return fallback;
            }
        }
        return fallback;
    }
}
My task is to retrieve html strings from urls using Java.
I know how to use HttpURLConnection and InputStream to get the string.
However, I have an encoding problem for some pages.
If some pages use an encoding other than UTF-8 (e.g., GB2312), the string I get is just arbitrary characters or question marks.
Can anyone please tell me how to solve this problem?
Thanks
Below is my code to download the html from a url.
/**
 * Downloads the document at {@code urlString} and returns its body as text.
 *
 * <p>The body is decoded with the charset named in the response's
 * {@code Content-Type} header; when the header is missing or names no usable
 * charset, GB2312 is used. A response whose {@code Content-Encoding} is
 * {@code gzip} or {@code deflate} is decompressed before decoding.
 *
 * <p>Failures are handled best-effort: the exception is printed and whatever
 * text was read before the failure (possibly the empty string) is returned.
 *
 * @param urlString absolute HTTP or HTTPS URL of the page to fetch
 * @return the decoded response body, or the part read before an error occurred
 */
private String downloadHtml(String urlString) {
    StringBuilder html = new StringBuilder();
    HttpURLConnection conn = null;
    InputStream body = null;
    Inflater inflater = null;
    try {
        conn = (HttpURLConnection) new URL(urlString).openConnection();
        // Per-connection switch. The static setFollowRedirects() only changes
        // the JVM-wide default picked up by connections created afterwards.
        conn.setInstanceFollowRedirects(true);

        // Compression is not requested here; to request it, set the
        // Accept-Encoding request header to "gzip, deflate" before connecting.
        // A compressed response is still handled in case the server sends one.
        String encoding = conn.getContentEncoding();
        InputStream raw = conn.getInputStream();
        body = raw; // closed in finally even if building a wrapper below fails
        if ("gzip".equalsIgnoreCase(encoding)) {
            body = new GZIPInputStream(raw);
        } else if ("deflate".equalsIgnoreCase(encoding)) {
            inflater = new Inflater(true);
            body = new InflaterInputStream(raw, inflater);
        }

        // The reader is not closed separately: closing 'body' in finally
        // releases the same underlying stream.
        InputStreamReader reader =
                new InputStreamReader(body, charsetFromContentType(conn.getContentType()));
        char[] chunk = new char[8192];
        int count;
        while ((count = reader.read(chunk)) != -1) {
            html.append(chunk, 0, count);
        }
    } catch (Exception e) {
        // Deliberately broad: bad URL, non-HTTP URL and I/O errors all fall
        // back to returning what was read so far.
        e.printStackTrace();
    } finally {
        if (body != null) {
            try {
                body.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        if (inflater != null) {
            // InflaterInputStream.close() does not end a caller-supplied Inflater.
            inflater.end();
        }
        if (conn != null) {
            conn.disconnect();
        }
    }
    return html.toString();
}

/**
 * Picks the charset to decode a response with.
 *
 * @param contentType value of the {@code Content-Type} header, may be {@code null}
 * @return the charset named by the header's {@code charset} parameter, or
 *         GB2312 when the parameter is absent, malformed or unsupported
 */
private static Charset charsetFromContentType(String contentType) {
    Charset fallback = Charset.forName("GB2312");
    if (contentType == null) {
        return fallback;
    }
    String key = "charset=";
    for (String part : contentType.split(";")) {
        String param = part.trim();
        if (!param.regionMatches(true, 0, key, 0, key.length())) {
            continue;
        }
        String name = param.substring(key.length()).trim();
        if (name.length() >= 2 && name.startsWith("\"") && name.endsWith("\"")) {
            name = name.substring(1, name.length() - 1);
        }
        try {
            return Charset.forName(name);
        } catch (IllegalArgumentException e) {
            // Covers both illegal and unsupported charset names.
            return fallback;
        }
    }
    return fallback;
}
By using an InputStreamReader and specifying your charset, like so:
inStr = new InputStreamReader(InputStream, Charset.forName("GB2312"));
The following code worked for me:
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.zip.GZIPInputStream;
import java.util.zip.Inflater;
import java.util.zip.InflaterInputStream;
/**
 * Small demo that downloads a web page and prints it, decoding the body with
 * the charset the server declares (GB2312 when none is declared).
 */
public class Foo {

    /**
     * Fetches a sample GB2312-encoded page and prints its HTML.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        System.out.println(downloadHtml("http://baike.baidu.com/view/6000001.htm"));
    }

    /**
     * Downloads the document at {@code urlString} and returns its body as text.
     *
     * <p>The body is decoded with the charset named in the response's
     * {@code Content-Type} header; when the header is missing or names no
     * usable charset, GB2312 is used. A response whose {@code Content-Encoding}
     * is {@code gzip} or {@code deflate} is decompressed before decoding.
     *
     * <p>Failures are handled best-effort: the exception is printed and
     * whatever text was read before the failure (possibly the empty string)
     * is returned.
     *
     * @param urlString absolute HTTP or HTTPS URL of the page to fetch
     * @return the decoded response body, or the part read before an error occurred
     */
    private static String downloadHtml(String urlString) {
        StringBuilder html = new StringBuilder();
        HttpURLConnection conn = null;
        InputStream body = null;
        Inflater inflater = null;
        try {
            conn = (HttpURLConnection) new URL(urlString).openConnection();
            // Per-connection switch. The static setFollowRedirects() only changes
            // the JVM-wide default picked up by connections created afterwards.
            conn.setInstanceFollowRedirects(true);

            // Compression is not requested here; to request it, set the
            // Accept-Encoding request header to "gzip, deflate" before connecting.
            // A compressed response is still handled in case the server sends one.
            String encoding = conn.getContentEncoding();
            InputStream raw = conn.getInputStream();
            body = raw; // closed in finally even if building a wrapper below fails
            if ("gzip".equalsIgnoreCase(encoding)) {
                body = new GZIPInputStream(raw);
            } else if ("deflate".equalsIgnoreCase(encoding)) {
                inflater = new Inflater(true);
                body = new InflaterInputStream(raw, inflater);
            }

            // The reader is not closed separately: closing 'body' in finally
            // releases the same underlying stream.
            InputStreamReader reader =
                    new InputStreamReader(body, charsetFromContentType(conn.getContentType()));
            char[] chunk = new char[8192];
            int count;
            while ((count = reader.read(chunk)) != -1) {
                html.append(chunk, 0, count);
            }
        } catch (Exception e) {
            // Deliberately broad: bad URL, non-HTTP URL and I/O errors all fall
            // back to returning what was read so far.
            e.printStackTrace();
        } finally {
            if (body != null) {
                try {
                    body.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (inflater != null) {
                // InflaterInputStream.close() does not end a caller-supplied Inflater.
                inflater.end();
            }
            if (conn != null) {
                conn.disconnect();
            }
        }
        return html.toString();
    }

    /**
     * Picks the charset to decode a response with.
     *
     * @param contentType value of the {@code Content-Type} header, may be {@code null}
     * @return the charset named by the header's {@code charset} parameter, or
     *         GB2312 when the parameter is absent, malformed or unsupported
     */
    private static Charset charsetFromContentType(String contentType) {
        Charset fallback = Charset.forName("GB2312");
        if (contentType == null) {
            return fallback;
        }
        String key = "charset=";
        for (String part : contentType.split(";")) {
            String param = part.trim();
            if (!param.regionMatches(true, 0, key, 0, key.length())) {
                continue;
            }
            String name = param.substring(key.length()).trim();
            if (name.length() >= 2 && name.startsWith("\"") && name.endsWith("\"")) {
                name = name.substring(1, name.length() - 1);
            }
            try {
                return Charset.forName(name);
            } catch (IllegalArgumentException e) {
                // Covers both illegal and unsupported charset names.
                return fallback;
            }
        }
        return fallback;
    }
}
这篇关于如何在Java中处理非UTF8 html页面?的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!