Reduce RAM usage in Python script

Problem Description

I've written a quick little program to scrape book data off of a UNESCO website which contains information about book translations. The code is doing what I want it to, but by the time it's processed about 20 countries, it's using ~6GB of RAM. Since there are around 200 I need to process, this isn't going to work for me.

I'm not sure where all the RAM usage is coming from, so I'm not sure how to reduce it. I'm assuming that it's the dictionary that's holding all the book information, but I'm not positive. I'm not sure if I should simply make the program run once for each country, rather than processing the lot of them? Or if there's a better way to do it?

This is the first time I've written anything like this, and I'm a pretty novice, self-taught programmer, so please point out any significant flaws in the code, or improvement tips you have that may not directly relate to the question at hand.

This is my code; thanks in advance for any assistance.

from __future__ import print_function
import urllib2, os
from bs4 import BeautifulSoup, SoupStrainer

''' Set list of countries and their code for niceness in explaining what
is actually going on as the program runs. '''
countries = {"AFG":"Afghanistan","ALA":"Aland Islands","DZA":"Algeria"}

'''List of country codes since dictionaries aren't sorted in any
way, this makes processing easier to deal with if it fails at
some point, mid run.'''
country_code_list = ["AFG","ALA","DZA"]

base_url = "http://www.unesco.org/xtrans/bsresult.aspx?lg=0&c="
destination_directory = "/Users/robbie/Test/"
only_restable = SoupStrainer(class_="restable")

class Book(object):
    def set_author(self,book): 
        '''Parse the webpage to find author names. Finds last name, then
        first name of original author(s) and sets the Book object's 
        Author attribute to the resulting string.'''

        authors = ""
        author_last_names = book.find_all('span',class_="sn_auth_name")
        author_first_names = book.find_all('span', attrs={\
            'class':"sn_auth_first_name"})
        if author_last_names == []: self.Author = [" "]

        for author in author_last_names:
            try: 
                first_name = author_first_names.pop()
                authors = authors + author.getText() + ', ' + \
                    first_name.getText()

            except IndexError:
                authors = authors + (author.getText())
        self.author = authors

    def set_quality(self,book):
        ''' Check to see if book page is using Quality, then set it if 
        so.'''

        quality = book.find_all('span', class_="sn_auth_quality")

        if len(quality) == 0: self.quality = " "

        else: self.quality = quality[0].contents[0]

    def set_target_title(self,book): 
        target_title = book.find_all('span', class_="sn_target_title")
        if len(target_title) == 0: self.target_title = " "
        else: self.target_title = target_title[0].contents[0]

    def set_target_language(self,book): 
        target_language = book.find_all('span', class_="sn_target_lang")
        if len(target_language) == 0: self.target_language = " "
        else: self.target_language = target_language[0].contents[0]

    def set_translator_name(self,book) : 
        translators = ""
        translator_last_names = book.find_all('span', class_="sn_transl_name")
        translator_first_names = book.find_all('span', \
                                               class_="sn_transl_first_name")
        if translator_first_names == [] and translator_last_names == [] :
            self.translators = " "
            return None

        for translator in translator_last_names:
            try: 
                first_name = translator_first_names.pop()
                translators = translators + \
                    (translator.getText() + ',' \
                     + first_name.getText())
            except IndexError:
                translators = translators + \
                    (translator.getText())

        self.translators = translators  

    def set_published_city(self,book) : 
        published_city = book.find_all('span', class_="place")
        if len(published_city) == 0: 
            self.published_city = " "
        else: self.published_city = published_city[0].contents[0]

    def set_publisher(self,book) : 
        publisher = book.find_all('span', class_="place")
        if len(publisher) == 0: 
            self.publisher = " "
        else: self.publisher = publisher[0].contents[0] 

    def set_published_country(self,book) : 
        published_country = book.find_all('span', \
                                        class_="sn_country")
        if len(published_country) == 0: 
            self.published_country = " "
        else: self.published_country = published_country[0].contents[0]

    def set_year(self,book) : 
        year = book.find_all('span', class_="sn_year")
        if len(year) == 0: 
            self.year = " "
        else: self.year = year[0].contents[0]   

    def set_pages(self,book) : 
        pages = book.find_all('span', class_="sn_pagination")
        if len(pages) == 0: 
            self.pages = " "
        else: self.pages = pages[0].contents[0] 

    def set_edition(self, book) :
        edition = book.find_all('span', class_="sn_editionstat")
        if len(edition) == 0: 
            self.edition = " "
        else: self.edition = edition[0].contents[0]

    def set_original_title(self,book) : 
        original_title = book.find_all('span', class_="sn_orig_title")
        if len(original_title) == 0: 
            self.original_title = " "
        else: self.original_title = original_title[0].contents[0]   

    def set_original_language(self,book) :
        languages = ''
        original_languages = book.find_all('span', \
                                         class_="sn_orig_lang")

        for language in original_languages:
            languages = languages + language.getText() + ', '

        self.original_languages = languages

    def export(self, country): 
        ''' Function to allow us to easilly pull the text from the 
        contents of the Book object's attributes and write them to the 
        country in which the book was published's CSV file.'''

        file_name = os.path.join(destination_directory + country + ".csv")

        with open(file_name, "a") as by_country_csv:        
            print(self.author.encode('UTF-8') + " & " + \
                  self.quality.encode('UTF-8') + " & " + \
                  self.target_title.encode('UTF-8') + " & " + \
                  self.target_language.encode('UTF-8') + " & " + \
                  self.translators.encode('UTF-8') + " & " + \
                  self.published_city.encode('UTF-8') + " & " + \
                  self.publisher.encode('UTF-8') + " & " + \
                  self.published_country.encode('UTF-8') + " & " + \
                  self.year.encode('UTF-8') + " & " + \
                  self.pages.encode('UTF-8') + " & " + \
                  self.edition.encode('UTF-8') + " & " + \
                  self.original_title.encode('UTF-8') + " & " + \
                  self.original_languages.encode('UTF-8'), file=by_country_csv)

        by_country_csv.close()

    def __init__(self, book, country):
        ''' Initialize the Book object by feeding it the HTML for its 
        row'''
        self.set_author(book)
        self.set_quality(book)
        self.set_target_title(book)
        self.set_target_language(book)

        self.set_translator_name(book)
        self.set_published_city(book)
        self.set_publisher(book)
        self.set_published_country(book)

        self.set_year(book)
        self.set_pages(book)
        self.set_edition(book)
        self.set_original_title(book)

        self.set_original_language(book)


def get_all_pages(country,base_url):
    ''' Create a list of URLs to be crawled by adding the ISO_3166-1_alpha-3
    country code to the URL and then iterating through the results every 10
    pages. Returns a string.'''

    base_page = urllib2.urlopen(base_url+country)
    page = BeautifulSoup(base_page, parse_only=only_restable)

    result_number = page.find_all('td',class_="res1",limit=1)
    if not result_number:
        return 0

    str_result_number = str(result_number[0].getText())
    results_total = int(str_result_number.split('/')[1])

    page.decompose()

    return results_total


def build_list(country_code_list, countries):
    '''  Build the list of all the books, and return a list of Book objects
    in case you want to do something with them in something else, ever.'''
    for country in country_code_list:

        print("Processing %s now..." % countries[country])
        results_total = get_all_pages(country, base_url)

        for url in range(results_total):
            if url % 10 == 0 :
                all_books = []  
                target_page = urllib2.urlopen(base_url + country \
                                             +"&fr="+str(url))
                page = BeautifulSoup(target_page, parse_only=only_restable)
                books = page.find_all('td',class_="res2")
                for book in books:
                    all_books.append(Book (book,country))
                page.decompose()

                for title in all_books:
                    title.export(country)    
    return

if __name__ == "__main__":
    build_list(country_code_list,countries)
    print("Completed.")

Solution

I guess I'll just list off some of the problems or possible improvements in no particular order:

  1. Follow PEP 8.

    Right now, you've got lots of variables and functions named using camel-case like setAuthor. That's not the conventional style for Python; Python would typically name it set_author (and published_country rather than PublishedCountry, etc.). You can even change the names of some of the things you're calling: for one, BeautifulSoup supports findAll for compatibility, but find_all is recommended.

    Besides naming, PEP 8 also specifies a few other things; for example, you'd want to rewrite this:

    if len(resultNumber) == 0 : return 0
    

    as this:

    if len(result_number) == 0:
        return 0
    

    or even taking into account the fact that empty lists are falsy:

    if not result_number:
        return 0
    

  2. Pass a SoupStrainer to BeautifulSoup.

    The information you're looking for is probably in only part of the document; you don't need to parse the whole thing into a tree. Pass a SoupStrainer as the parse_only argument to BeautifulSoup. This should reduce memory usage by discarding unnecessary parts early.
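
    A minimal sketch of what that looks like (essentially what the posted code already does with only_restable; base_url and country are as in the posted code):

    from bs4 import BeautifulSoup, SoupStrainer
    import urllib2

    # Only the results table is turned into tree nodes; the rest of the
    # document is discarded during parsing.
    only_restable = SoupStrainer(class_="restable")

    html = urllib2.urlopen(base_url + country)
    page = BeautifulSoup(html, parse_only=only_restable)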

  3. decompose the soup when you're done with it.

    Python primarily uses reference counting, so removing all circular references (as decompose does) should let reference counting, its primary garbage-collection mechanism, free up a lot of memory. Python also has a semi-traditional garbage collector to deal with circular references, but reference counting is much faster.
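
    In practice that just means copying out the plain strings you need and then calling decompose on the soup, for example:

    page = BeautifulSoup(target_page, parse_only=only_restable)
    rows = page.find_all('td', class_="res2")
    texts = [row.get_text() for row in rows]  # plain strings, no links back into the tree
    page.decompose()                          # break the tree's internal references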

  4. Don't make Book.__init__ write things to disk.

    In most cases, I wouldn't expect just creating an instance of a class to write something to disk. Remove the call to export; let the user call export if they want it to be put on the disk.
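
    That is, construction and output stay two explicit steps for the caller:

    book = Book(row, country)   # just builds the object
    book.export(country)        # writing to disk happens only when asked for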

  5. Stop holding on to so much data in memory.

    You're accumulating all this data into a dictionary just to export it afterwards. The obvious thing to do to reduce memory is to dump it to disk as soon as possible. Your comment indicates that you're putting it in a dictionary to be flexible; but that doesn't mean you have to collect it all in a list: use a generator, yielding items as you scrape them. Then the user can iterate over it just like a list:

    for book in scrape_books():
        book.export()
    

    …but with the advantage that at most one book will be kept in memory at a time.
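
    A sketch of what scrape_books could look like, reusing the helpers from the posted code (scrape_books itself isn't defined anywhere above, so this is just one way to write it):

    def scrape_books(country):
        '''Yield one Book at a time instead of accumulating them in a list.'''
        results_total = get_all_pages(country, base_url)
        for offset in range(0, results_total, 10):
            target_page = urllib2.urlopen(base_url + country + "&fr=" + str(offset))
            page = BeautifulSoup(target_page, parse_only=only_restable)
            for row in page.find_all('td', class_="res2"):
                yield Book(row, country)
            page.decompose()

    for country in country_code_list:
        for book in scrape_books(country):
            book.export(country)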

  6. Use the functions in os.path rather than munging paths yourself.

    Your code right now is rather fragile when it comes to path names. If I accidentally removed the trailing slash from destinationDirectory, something unintended happens. Using os.path.join prevents that from happening and deals with cross-platform differences:

    >>> os.path.join("/Users/robbie/Test/", "USA")
    '/Users/robbie/Test/USA'
    >>> os.path.join("/Users/robbie/Test", "USA")  # still works!
    '/Users/robbie/Test/USA'
    >>> # or say we were on Windows:
    >>> os.path.join(r"C:\Documents and Settings\robbie\Test", "USA")
    'C:\\Documents and Settings\\robbie\\Test\\USA'
    

  7. Abbreviate attrs={"class":...} to class_=....

    BeautifulSoup 4.1.2 introduces searching with class_, which removes the need for the verbose attrs={"class":...}.
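
    For example, these two calls from the posted code are equivalent, and the second form is shorter:

    book.find_all('span', attrs={'class': "sn_auth_first_name"})
    book.find_all('span', class_="sn_auth_first_name")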

I imagine there are even more things you can change, but that's quite a few to start with.
