Python:只写最后一行输出 [英] Python: Only writes last line of output
本文介绍了Python:只写最后一行输出的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!
问题描述
import re
import urllib.request
#从网站的HTML源代码
def extractUrls(url,unique = True,sort = True,restrictToTld = None):
#Prependwww。如果不存在
如果url [0:4]!=www。:
url =.join([www。,url])
#打开连接
与urllib.request.urlopen(http://+ url)作为h:
#获取标题
headers = h.info()
#默认字符集
charset =ISO-8859-1
#如果一个字符集在头文件中,那么在头文件中覆盖缺省的
:
match = re.search(rcharset = ([\ w\ - ] +),headers [i],re.I)
if match!= None:
charset = match.group(1).lower()
$ break
#抓取并解码源代码
source = h.read()。decode(charset)
#查找源代码中的所有URL
matches = re。的findall(R http\:?\ / \ /(WWW)([A-z0-9\-\] + \ [AZ] {2,6-})\b ,source,re.I)
#如果找不到网址,则中止
如果匹配==无:
返回无
#收集URLs
collection = []
#在网址中逐个浏览
匹配网址:
url = url [1] .lower()
#如果有多个点,则URL包含
#subdomain(s),如果url.count(。)>删除了
, 1:
temp = url.split(。)
tld = temp.pop()
url =.join([temp.pop(),。,tld ])
#如果设置了一个,则限制为TLD
如果restrictToTld:
tld = url.split(。)。pop()
if tld!= restrictToTld:
continue
#如果只有唯一的URL应该返回
如果唯一:
如果url不在集合中:
collection.append(url)
#否则只是将URL添加到集合中
else else
collection.append(url)
$ do
return sort(collection)if sort else collection
#测试
url = "msn.com"
print("Parent:", url)
for x in extractUrls(url):
    print("-", x)
    f = open("f2.txt", "w+", 1)
    f.write(x)
    f.close()
输出结果是:
父母:msn.com
- 2o7.net
- atdmt.com
- bing.com
- careerbuilder.com
- delish.com
- discoverbing.com
- discovermsn.com
- facebook.com
- foxsports.com
- foxsportsarizona。 com
- foxsportssouthwest.com
- icra.org
- live.com
- microsoft.com
- msads.net
- msn.com
- msnrewards.com
- myhomemsn.com
- nbcnews.com
- northjersey.com
- outlook.com
- revsci.net
- rsac.org
- s-msn.com
- scorecardresearch.com
- skype.com
- twitter.com
- w3.org
- yardbarker.com
[完成于0.8s]
只有yardbarker.com写入文件。
解决方案
url = "msn.com"
print("Parent:", url)
f = open("f2.txt", "w")
for x in extractUrls(url):
    print("-", x)
    f.write(x)
f.close()
Trying to write a program that extracts URLs from a website. The output is good, but when I try to write the output to a file, only the last record is written. Here is the code:
import re
import urllib.request
# Retrieves URLs from the HTML source code of a website
def extractUrls(url, unique=True, sort=True, restrictToTld=None):
    """Fetch *url* over HTTP and return the second-level domains of all
    URLs found in its HTML source.

    Args:
        url: Host name to fetch (with or without a leading "www.").
        unique: If True, drop duplicate domains from the result.
        sort: If True, return the domains sorted alphabetically.
        restrictToTld: If set (e.g. "com"), keep only domains with that TLD.

    Returns:
        A list of domain strings, or None if no URLs were found.

    Raises:
        urllib.error.URLError: If the host cannot be reached.
    """
    # Prepend "www." if not present
    if not url.startswith("www."):
        url = "www." + url
    # Open a connection; the context manager closes it even on error
    with urllib.request.urlopen("http://" + url) as h:
        headers = h.info()
        # Default charset for HTTP when none is declared
        charset = "ISO-8859-1"
        # If a charset is declared in the response headers, override the default
        for name in headers:
            match = re.search(r"charset=([\w\-]+)", headers[name], re.I)
            if match is not None:  # identity check, not "!= None"
                charset = match.group(1).lower()
                break
        # Grab and decode the source code
        source = h.read().decode(charset)
    # Find all URLs in the source code.
    # BUG FIX: the original pattern used "(www.)?" with an unescaped dot,
    # which matched ANY character after "www"; it is now escaped as "(www\.)?".
    matches = re.findall(r"http\:\/\/(www\.)?([a-z0-9\-\.]+\.[a-z]{2,6})\b",
                         source, re.I)
    # re.findall returns a (possibly empty) list, never None, so the original
    # "matches == None" test could never fire. Preserve the documented
    # contract of returning None when nothing was found.
    if not matches:
        return None
    # Collect URLs
    collection = []
    for groups in matches:
        domain = groups[1].lower()
        # More than one dot means the URL carries subdomain(s); keep only
        # the last two labels (second-level domain + TLD).
        if domain.count(".") > 1:
            parts = domain.split(".")
            domain = ".".join(parts[-2:])
        # Restrict to a TLD if one is set
        if restrictToTld and domain.split(".")[-1] != restrictToTld:
            continue
        # If only unique URLs should be returned, skip duplicates
        if unique:
            if domain not in collection:
                collection.append(domain)
        # Otherwise just add the URL to the collection
        else:
            collection.append(domain)
    # Done
    return sorted(collection) if sort else collection
# Test driver: print each extracted domain and write them all to a file.
url = "msn.com"
print("Parent:", url)
# BUG FIX: the file was opened in truncating mode INSIDE the loop, so each
# iteration wiped the file and only the last URL survived. Open it once,
# before the loop, and let the "with" block close it for us.
with open("f2.txt", "w") as f:
    for x in extractUrls(url):
        print("-", x)
        f.write(x + "\n")  # newline so each URL lands on its own line
The output is:
Parent: msn.com
- 2o7.net
- atdmt.com
- bing.com
- careerbuilder.com
- delish.com
- discoverbing.com
- discovermsn.com
- facebook.com
- foxsports.com
- foxsportsarizona.com
- foxsportssouthwest.com
- icra.org
- live.com
- microsoft.com
- msads.net
- msn.com
- msnrewards.com
- myhomemsn.com
- nbcnews.com
- northjersey.com
- outlook.com
- revsci.net
- rsac.org
- s-msn.com
- scorecardresearch.com
- skype.com
- twitter.com
- w3.org
- yardbarker.com
[Finished in 0.8s]
Only "yardbarker.com" is written to the file. I appreciate the help, thank you.
解决方案
# Solution: open the output file ONCE, before the loop, so it is truncated
# a single time instead of on every iteration.
url = "msn.com"
print("Parent:", url)
# Use a context manager so the file is closed even if extractUrls raises.
with open("f2.txt", "w") as f:
    for x in extractUrls(url):
        print("-", x)
        f.write(x)
这篇关于Python:只写最后一行输出的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!
查看全文