从大型结构化文本文件中提取信息 [英] Extracting info from large structured text files
问题描述
我需要读取一些大文件(从 50k 到 100k 行),这些文件由以空行分隔的组构成.每组都以相同的模式"No.999999999 dd/mm/yyyy ZZZ"开头.这是一些示例数据.
I need to read some large files (from 50k to 100k lines), structured in groups separated by empty lines. Each group starts with the same pattern "No.999999999 dd/mm/yyyy ZZZ". Here's some sample data.
No.813829461 16/09/1987 270
Tit.SUZANO PAPEL E CELULOSE S.A. (BR/BA)
C.N.P.J./C.I.C./N INPI : 16404287000155
Procurador: MARCELLO DO NASCIMENTO
No.813829461 16/09/1987 270
Tit.SUZANO PAPEL E CELULOSE S.A. (BR/BA)
C.N.P.J./C.I.C./N INPI : 16404287000155
Procurador: MARCELLO DO NASCIMENTO
No.815326777 28/12/1989 351
Tit.SIGLA SISTEMA GLOBO DE GRAVACOES AUDIO VISUAIS LTDA (BR/RJ)
C.N.P.J./C.I.C./NºINPI : 34162651000108
Apres.: Nominativa ; Nat.: De Produto
Marca: TRIO TROPICAL
Clas.Prod/Serv: 09.40
*DEFERIDO CONFORME RESOLUÇÃO 123 DE 06/01/2006,PUBLICADA NA RPI 1829,DE 24/01/2006.
Procurador: WALDEMAR RODRIGUES PEDRA
No.815326777 28/12/1989 351
Tit.SIGLA SISTEMA GLOBO DE GRAVACOES AUDIO VISUAIS LTDA (BR/RJ)
C.N.P.J./C.I.C./NºINPI : 34162651000108
Apres.: Nominativa ; Nat.: De Produto
Marca: TRIO TROPICAL
Clas.Prod/Serv: 09.40
*DEFERIDO CONFORME RESOLUÇÃO 123 DE 06/01/2006, PUBLICADA NA RPI 1829, DE 24/01/2006.
Procurador: WALDEMAR RODRIGUES PEDRA
No.900148764 11/01/2007 LD3
Tit.TIARA BOLSAS E CALÇADOS LTDA
Procurador: Marcia Ferreira Gomes
*Escritório: Marcas Marcantes e Patentes Ltda
*Exigência Formal não respondida Satisfatoriamente, Pedido de Registro de Marca considerado inexistente, de acordo com Art. 157 da LPI
*Protocolo da Petição de cumprimento de Exigência Formal: 810080140197
No.900148764 11/01/2007 LD3
Tit.TIARA BOLSAS E CALÇADOS LTDA
Procurador: Marcia Ferreira Gomes
*Escritório: Marcas Marcantes e Patentes Ltda
*Exigência Formal não respondida Satisfatoriamente, Pedido de Registro de Marca considerado inexistente, de acordo com Art. 157 da LPI
*Protocolo da Petição de cumprimento de Exigência Formal: 810080140197
我写了一些代码来相应地解析它.有什么我可以改进的,以提高可读性或性能?这是我到目前为止所做的:
I wrote some code that parses it accordingly. Is there anything I can improve for readability or performance? Here's what I have come up with so far:
import re, pprint
class Despacho(object):
    """
    Parse the lines of one record and store the captured fields.

    Every line given to ``read`` is tried against the patterns in
    ``regexp``; the handler registered for the matching pattern copies the
    captured groups onto the instance.

    Attributes created by the handlers (each exists only after a matching
    line has been read): ``processo``, ``data``, ``despacho``, ``titular``,
    ``procurador``, ``documento``, ``apresentacao``, ``natureza``,
    ``marca``, ``classe``.  ``complemento`` always exists and is a list.
    """

    # Maps a compiled pattern to a callable that, given the instance,
    # returns the bound handler for that pattern.  The indirection through
    # a lambda is needed because the handler methods are defined further
    # down in the class body.
    #
    # Literal dots are escaped so that e.g. "No." cannot match "NoX".
    regexp = {
        re.compile(r'No\.(\d{9}) (\d{2}/\d{2}/\d{4}) (.*)'):
            lambda self: self._processo,
        re.compile(r'Tit\.(.*)'):
            lambda self: self._titular,
        re.compile(r'Procurador: (.*)'):
            lambda self: self._procurador,
        # The label appears both as "N INPI" and with an ordinal sign
        # between "N" and "INPI", so up to three arbitrary characters are
        # tolerated there.  Whitespace around the colon is not captured.
        re.compile(r'C\.N\.P\.J\./C\.I\.C\./N.{0,3}?INPI\s*:\s*(.*)'):
            lambda self: self._documento,
        re.compile(r'Apres\.: (.*) ; Nat\.: (.*)'):
            lambda self: self._apresentacao,
        re.compile(r'Marca: (.*)'):
            lambda self: self._marca,
        re.compile(r'Clas\.Prod/Serv: (.*)'):
            lambda self: self._classe,
        re.compile(r'\*(.*)'):
            lambda self: self._complemento,
    }

    def __init__(self):
        """
        'complemento' is the only field that can be multiple in a single
        registry, so it starts out as an empty list.
        """
        self.complemento = []

    def _processo(self, matches):
        """Store process number, date and dispatch code."""
        self.processo, self.data, self.despacho = matches.groups()

    def _titular(self, matches):
        """Store the holder name."""
        self.titular = matches.group(1)

    def _procurador(self, matches):
        """Store the attorney name."""
        self.procurador = matches.group(1)

    def _documento(self, matches):
        """Store the document number."""
        self.documento = matches.group(1)

    def _apresentacao(self, matches):
        """Store presentation and nature."""
        self.apresentacao, self.natureza = matches.groups()

    def _marca(self, matches):
        """Store the trademark name."""
        self.marca = matches.group(1)

    def _classe(self, matches):
        """Store the product/service class."""
        self.classe = matches.group(1)

    def _complemento(self, matches):
        """Append one more complement line."""
        self.complemento.append(matches.group(1))

    def read(self, line):
        """
        Apply the first pattern that matches ``line`` and store its groups.

        Lines that match no pattern are ignored.  ``re.match`` anchors at
        the start of the line and ``.`` does not match a newline, so a
        trailing line break is never part of a captured value.
        """
        for pattern, handler in Despacho.regexp.items():
            m = pattern.match(line)
            if m:
                handler(self)(m)
                # Every pattern starts with a different literal prefix, so
                # at most one can match; stop scanning.
                break
def process(rpi):
    """
    Read data and yield one ``Despacho`` per group.

    ``rpi`` is any iterable of text lines (an open file works).  A group
    starts at a line beginning with ``'No.'`` and ends at the next blank
    line, at the next ``'No.'`` line, or at the end of the input, so the
    last group is yielded even when the input has no trailing blank line.
    Lines that appear outside any group are skipped.
    """
    current = None  # record being filled, or None while between groups
    for line in rpi:
        if line.startswith('No.'):
            # A header without a preceding blank line still closes the
            # previous record instead of silently discarding it.
            if current is not None:
                yield current
            current = Despacho()
        elif not line.strip():  # empty line - end of block
            if current is not None:
                yield current
                current = None
            continue
        if current is not None:
            current.read(line)
    # Input ended in the middle of a group: emit what was collected.
    if current is not None:
        yield current
# Parse the input file and dump the attributes of every record.  The
# ``with`` block closes the file once the loop finishes or an error occurs.
with open('rm1972.txt') as arquivo:  # file to process
    for desp in process(arquivo):
        pprint.pprint(desp.__dict__)
        print('--------------')
推荐答案
那很好.下面是一些建议,如果你喜欢它们,请告诉我:
That is pretty good. Below some suggestions, let me know if you like'em:
import re
import pprint
import sys
class Despacho(object):
    """
    Parse the lines of one record and store the captured fields.

    ``regexp`` maps a tuple of attribute names to the pattern whose capture
    groups fill those attributes, in order.  ``read`` applies the patterns
    to a single line; ``repr()`` shows every known attribute, using
    ``None`` for those that were never set.
    """

    # Used a dict with the attribute names as keys instead of functions.
    # Literal dots are escaped so that e.g. "No." cannot match "NoX".
    regexp = {
        ('processo', 'data', 'despacho'):
            re.compile(r'No\.(\d{9}) (\d{2}/\d{2}/\d{4}) (.*)'),
        ('titular',):
            re.compile(r'Tit\.(.*)'),
        ('procurador',):
            re.compile(r'Procurador: (.*)'),
        # The label appears both as "N INPI" and with an ordinal sign
        # between "N" and "INPI", so up to three arbitrary characters are
        # tolerated there.  Whitespace around the colon is not captured.
        ('documento',):
            re.compile(r'C\.N\.P\.J\./C\.I\.C\./N.{0,3}?INPI\s*:\s*(.*)'),
        ('apresentacao', 'natureza'):
            re.compile(r'Apres\.: (.*) ; Nat\.: (.*)'),
        ('marca',):
            re.compile(r'Marca: (.*)'),
        ('classe',):
            re.compile(r'Clas\.Prod/Serv: (.*)'),
        ('complemento',):
            re.compile(r'\*(.*)'),
    }

    def __init__(self):
        """
        'complemento' is the only field that can be multiple in a single
        registry, so it starts out as an empty list.
        """
        self.complemento = []

    def read(self, line):
        """
        Apply the first pattern that matches ``line`` and store its groups.

        Lines that match no pattern are ignored.
        """
        # items() rather than iteritems(): it exists on Python 2 and 3.
        for attrs, pattern in self.regexp.items():
            m = pattern.match(line)
            if not m:
                continue
            for attr, value in zip(attrs, m.groups()):
                if attr == 'complemento':
                    # special case: accumulate instead of overwrite
                    self.complemento.append(value)
                else:
                    # set the attribute on the object
                    setattr(self, attr, value)
            # Every pattern starts with a different literal prefix, so at
            # most one can match; stop scanning.
            break

    def __repr__(self):
        """Return a pretty-printed dict of all known attributes."""
        d = {}
        for attrs in self.regexp:
            for attr in attrs:
                d[attr] = getattr(self, attr, None)
        return pprint.pformat(d)
def process(rpi):
    """
    Read data and yield one ``Despacho`` per group.

    ``rpi`` is any iterable of text lines; it is iterated directly, so an
    open file can be passed as is.  A group opens at a line starting with
    ``'No.'`` and closes at the following blank line, at the next
    ``'No.'`` line, or when the input is exhausted.  Lines found outside a
    group are ignored.
    """
    d = None  # the record under construction; None when no group is open
    for line in rpi:
        is_header = line.startswith('No.')
        is_blank = not line.strip()
        if d is not None and (is_header or is_blank):
            # Blank line or a new header: the open group is complete.
            yield d
            d = None
        if is_header:
            d = Despacho()
        if d is not None:
            d.read(line)
    if d is not None:
        # No trailing blank line after the last group.
        yield d
def main():
    """
    Parse ``rm1972.txt`` and print every record followed by a separator.

    Returns 0 so the result can be used as a process exit status.
    """
    # The ``with`` block closes the file even if parsing raises.
    with open('rm1972.txt') as arquivo:  # file to process
        for desp in process(arquivo):
            # Printing the object uses Despacho.__repr__.  The call form
            # gives the same output on Python 2 and Python 3.
            print(desp)
            print('-' * 20)
    return 0
if __name__ == '__main__':
    # Hand main()'s return value to the shell as the exit status.
    sys.exit(main())
这篇关于从大型结构化文本文件中提取信息的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!