如何在python中压缩非常大的文件 [英] How to zip a very large file in python

查看:655
本文介绍了如何在python中压缩非常大的文件的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我想使用 python 压缩几个总量可能达到约 99 GB 的文件。请问使用 zipfile 库完成此操作最高效的方法是什么？下面是我的示例代码：

with gcs.open(zip_file_name, 'w', content_type=b'application/zip') as f:

    with zipfile.ZipFile(f, 'w') as z:

        for file in files:

            is_owner = (is_page_allowed_to_visitor(page, visitor) or (file.owner_id == visitor.id))

            if is_owner:
                file.show = True
            elif file.available_from:
                if file.available_from > datetime.now():
                    file.show = False
            elif file.available_to:
                if file.available_to < datetime.now():
                    file.show = False
            else:
                file.show = True

            if file.show:

                file_name = "/%s/%s" % (gcs_store.get_bucket_name(), file.gcs_name)

                gcs_reader = gcs.open(file_name, 'r')

                z.writestr('%s-%s' % (file.created_on, file.name), gcs_reader.read())

                gcs_reader.close()

f.close()  # closing zip file

需要注意的几点:



1)我使用谷歌应用引擎来托管这些文件,所以我不能使用zipfile.write()方法。我只能以字节获得文件内容。



预先致谢

我已经为 zipfile 库添加了一个新方法。这个增强的 zipfile 库是开源的，可以在 github 上找到（EnhancedZipFile）。我从 zipfile.write() 方法和 zipfile.writestr() 方法中获得灵感，添加了这个新方法。

def writebuffered(self, zinfo_or_arcname, file_pointer, file_size, compress_type=None):
    if not isinstance(zinfo_or_arcname, ZipInfo):
        zinfo = ZipInfo(filename=zinfo_or_arcname,
                        date_time=time.localtime(time.time())[:6])

        zinfo.compress_type = self.compression
        if zinfo.filename[-1] == '/':
            zinfo.external_attr = 0o40775 << 16   # drwxrwxr-x
            zinfo.external_attr |= 0x10           # MS-DOS directory flag
        else:
            zinfo.external_attr = 0o600 << 16     # ?rw-------
    else:
        zinfo = zinfo_or_arcname

    zinfo.file_size = file_size            # 未压缩大小
    zinfo.header_offset = self.fp.tell()   # 文件头起始字节
    self._writecheck(zinfo)
    self._didModify = True

    fp = file_pointer
    # 之后必须用正确的数据覆盖 CRC 和大小
    zinfo.CRC = CRC = 0
    zinfo.compress_size = compress_size = 0
    # 压缩后的大小可能大于未压缩大小
    zip64 = self._allowZip64 and \
            zinfo.file_size * 1.05 > ZIP64_LIMIT
    self.fp.write(zinfo.FileHeader(zip64))
    if zinfo.compress_type == ZIP_DEFLATED:
        cmpr = zlib.compressobj(zlib.Z_DEFAULT_COMPRESSION,
             zlib.DEFLATED, -15)
    else:
        cmpr = None
    file_size = 0
    while 1:
        buf = fp.read(1024 * 8)
        if not buf:
            break
        file_size = file_size + len(buf)
        CRC = crc32(buf, CRC) & 0xffffffff
        if cmpr:
            buf = cmpr.compress(buf)
            compress_size = compress_size + len(buf)
        self.fp.write(buf)

    if cmpr:
        buf = cmpr.flush()
        compress_size = compress_size + len(buf)
        self.fp.write(buf)
        zinfo.compress_size = compress_size
    else:
        zinfo.compress_size = file_size
    zinfo.CRC = CRC
    zinfo.file_size = file_size
    if not zip64 and self._allowZip64:
        if file_size > ZIP64_LIMIT:
            raise RuntimeError('File size has increased during compressing')
        if compress_size > ZIP64_LIMIT:
            raise RuntimeError('Compressed size larger than uncompressed size')
    # 向后寻址并重写文件头（现在包含正确的 CRC 和文件大小）
    position = self.fp.tell()       # 保留文件中的当前位置
    self.fp.flush()
    self.filelist.append(zinfo)
    self.NameToInfo[zinfo.filename] = zinfo

注意事项




  • 我是python的新手,所以我上面写的代码可能不是非常优化的。

  • 请为 github 上的项目做出贡献：https://github.com/najela/EnhancedZipFile


I would like to zip a couple of files that may amount to about 99 GB using python. Please what is the most efficient way to do this using the zipfile library. This is a sample code I have

# Stream each visible file out of Google Cloud Storage and append it to a
# single zip archive that is itself written back to GCS.
with gcs.open(zip_file_name, 'w', content_type=b'application/zip') as f:

    with zipfile.ZipFile(f, 'w') as z:

        for file in files:

            # A file is visible when the visitor may see the page or owns the
            # file; otherwise the availability window is consulted.
            # NOTE(review): because of the elif chain only one of
            # available_from / available_to is ever checked per file, and
            # neither branch sets file.show back to True when the check
            # passes — confirm this is the intended visibility logic.
            is_owner = (is_page_allowed_to_visitor(page, visitor) or (file.owner_id == visitor.id))

            if is_owner:
                file.show = True
            elif file.available_from:
                if file.available_from > datetime.now():
                    file.show = False
            elif file.available_to:
                if file.available_to < datetime.now():
                    file.show = False
            else:
                file.show = True

            if file.show:

                # GCS object path convention: /<bucket>/<object-name>
                file_name = "/%s/%s" % (gcs_store.get_bucket_name(), file.gcs_name)

                gcs_reader = gcs.open(file_name, 'r')

                # NOTE: writestr() holds the entire object in memory, which
                # is exactly what makes this approach unusable for ~99 GB.
                z.writestr('%s-%s' % (file.created_on, file.name), gcs_reader.read())

                gcs_reader.close()

# BUG FIX: the explicit f.close() that followed the with-block was removed —
# the context manager has already closed f, and closing a GCS file object a
# second time is redundant and may raise.

Some points to note:

1) I am using the google app engine to host the files so I cannot use the zipfile.write() method. I can only get the file contents in bytes.

Thanks in advance

解决方案

I have added a new method to the zipfile library. This enhanced zipfile library is open source and can be found on github (EnhancedZipFile). I added a new method with the inspiration from the zipfile.write() method and the zipfile.writestr()method

def writebuffered(self, zinfo_or_arcname, file_pointer, file_size, compress_type=None):
    """Stream the contents of *file_pointer* into the archive.

    Unlike ZipFile.write() this needs no filesystem path, and unlike
    ZipFile.writestr() it never holds the whole payload in memory: the
    source is read in 8 KiB chunks, so arbitrarily large files can be
    added with constant memory usage.

    Args:
        zinfo_or_arcname: archive member name (str) or a prepared ZipInfo.
        file_pointer: file-like object supporting read(size).
        file_size: expected uncompressed size in bytes; used only to decide
            up front whether a ZIP64 local header is needed (the real size
            is recounted while streaming).
        compress_type: NOTE(review): accepted but never used — the
            ZipFile-level compression setting always wins; confirm intent.

    Raises:
        RuntimeError: if the streamed or compressed data crosses
            ZIP64_LIMIT after a non-ZIP64 header was already written.
    """
    if not isinstance(zinfo_or_arcname, ZipInfo):
        zinfo = ZipInfo(filename=zinfo_or_arcname,
                        date_time=time.localtime(time.time())[:6])

        zinfo.compress_type = self.compression
        if zinfo.filename[-1] == '/':
            zinfo.external_attr = 0o40775 << 16   # drwxrwxr-x
            zinfo.external_attr |= 0x10           # MS-DOS directory flag
        else:
            zinfo.external_attr = 0o600 << 16     # ?rw-------
    else:
        zinfo = zinfo_or_arcname

    zinfo.file_size = file_size             # Uncompressed size (estimate)
    zinfo.header_offset = self.fp.tell()    # Start of header bytes
    self._writecheck(zinfo)
    self._didModify = True

    fp = file_pointer
    # CRC and sizes are unknown until the data has been streamed; write
    # zeros now and patch the header afterwards.
    zinfo.CRC = CRC = 0
    zinfo.compress_size = compress_size = 0
    # Compressed size can be larger than uncompressed size, hence the
    # 5% headroom when deciding on ZIP64.
    zip64 = self._allowZip64 and \
            zinfo.file_size * 1.05 > ZIP64_LIMIT
    self.fp.write(zinfo.FileHeader(zip64))
    if zinfo.compress_type == ZIP_DEFLATED:
        # -15: raw deflate stream, no zlib header/trailer (zip format).
        cmpr = zlib.compressobj(zlib.Z_DEFAULT_COMPRESSION,
                                zlib.DEFLATED, -15)
    else:
        cmpr = None
    file_size = 0
    while 1:
        buf = fp.read(1024 * 8)
        if not buf:
            break
        file_size = file_size + len(buf)
        CRC = crc32(buf, CRC) & 0xffffffff
        if cmpr:
            buf = cmpr.compress(buf)
            compress_size = compress_size + len(buf)
        self.fp.write(buf)

    if cmpr:
        buf = cmpr.flush()
        compress_size = compress_size + len(buf)
        self.fp.write(buf)
        zinfo.compress_size = compress_size
    else:
        zinfo.compress_size = file_size
    zinfo.CRC = CRC
    zinfo.file_size = file_size
    if not zip64 and self._allowZip64:
        if file_size > ZIP64_LIMIT:
            raise RuntimeError('File size has increased during compressing')
        if compress_size > ZIP64_LIMIT:
            raise RuntimeError('Compressed size larger than uncompressed size')
    # BUG FIX: the header written above still holds zeros for CRC and the
    # two size fields.  Seek back, rewrite it with the real values, then
    # restore the stream position — mirrors CPython's ZipFile.write().
    # Without this the archive's local headers are invalid.
    position = self.fp.tell()       # Preserve current position in file
    self.fp.seek(zinfo.header_offset, 0)
    self.fp.write(zinfo.FileHeader(zip64))
    self.fp.seek(position, 0)
    self.fp.flush()
    self.filelist.append(zinfo)
    self.NameToInfo[zinfo.filename] = zinfo

Points to note

这篇关于如何在python中压缩非常大的文件的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆