Go: Excessive memory usage, memory leak

Problem description

I am very, very careful about memory, as I have to write programs that need to cope with massive datasets.

Currently my application quickly reaches 32GB of memory, starts swapping, and then gets killed by the system.

I do not understand how this can be since all variables are collectable (in functions and quickly released) except TokensStruct and TokensCount in the Trainer struct. TokensCount is just a uint. TokensStruct is a 1,000,000 row slice of [5]uint32 and string, so that means 20 bytes + string, which we could call a maximum of 50 bytes per record. 50*1000000 = 50MB of memory required. So this script should therefore not use much more than 50MB + overhead + temporary collectable variables in the functions (maybe another 50MB max.) The maximum potential size of TokensStruct is 5,000,000, as this is the size of dictionary, but even then it would be only 250MB of memory. dictionary is a map and apparently uses around 600MB of memory, as that is how the app starts, but this is not an issue because dictionary is only loaded once and never written to again.
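
As a sanity check on that arithmetic, here is a minimal sketch (assuming a 64-bit platform; unsafe.Sizeof reports only the fixed-size part of each value, not the bytes a string points to):

package main

import (
    "fmt"
    "unsafe"
)

func main() {
    var v [5]uint32
    var s string
    fmt.Println(unsafe.Sizeof(v)) // 20: five 4-byte values
    fmt.Println(unsafe.Sizeof(s)) // 16: pointer + length of the string header
    // Rough upper bound used above: ~50 bytes per record.
    fmt.Println(50*1000000/(1<<20), "MiB") // about 47 MiB for 1,000,000 records
}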

Instead it uses 32GB of memory then dies. By the speed that it does this I expect it would happily get to 1TB of memory if it could. The memory appears to increase in a linear fashion with the size of the files being loaded, meaning that it appears to never clear any memory at all. Everything that enters the app is allocated more memory and memory is never freed.

I tried implementing runtime.GC() in case the garbage collection wasn't running often enough, but this made no difference.
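
One way to tell whether the heap itself keeps growing (as opposed to the garbage collector simply not returning freed memory to the OS) is to log runtime.MemStats around the loop. A minimal sketch; logMemUsage is a hypothetical helper, not part of the program below:

package main

import (
    "fmt"
    "runtime"
)

// logMemUsage prints how much heap is currently live (HeapAlloc) versus how
// much memory has been obtained from the OS (HeapSys), plus the GC count.
func logMemUsage(label string) {
    var m runtime.MemStats
    runtime.ReadMemStats(&m)
    fmt.Printf("%s: HeapAlloc=%d MiB, HeapSys=%d MiB, NumGC=%d\n",
        label, m.HeapAlloc>>20, m.HeapSys>>20, m.NumGC)
}

func main() {
    logMemUsage("start")
    // ... load and parse files here ...
    logMemUsage("after loading")
}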

Since the memory usage increases in a linear fashion, this would imply that there is a memory leak in getTokens() or LoadZip(). I don't know how this could be, since they are both functions that only do one task and then return. Or it could be that the tokens variable in Start() is the cause of the leak. Basically it looks like every file that is loaded and parsed is never released from memory, as that is the only way that the memory could fill up in a linear fashion and keep on rising up to 32GB++.
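
A heap profile shows directly which allocation sites are holding live memory and would settle where the growth comes from. A minimal sketch using the standard runtime/pprof package (writeHeapProfile is a hypothetical helper, not part of the program below); the resulting file can be inspected with go tool pprof <binary> heap.prof:

package main

import (
    "os"
    "runtime/pprof"
)

// writeHeapProfile dumps a heap profile of the live objects to path.
func writeHeapProfile(path string) error {
    f, err := os.Create(path)
    if err != nil {
        return err
    }
    defer f.Close()
    return pprof.WriteHeapProfile(f)
}

func main() {
    // ... after processing a batch of files ...
    if err := writeHeapProfile("heap.prof"); err != nil {
        panic(err)
    }
}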

Absolute nightmare! What's wrong with Go? Any way to fix this?

package main

import (
    "bytes"
    "code.google.com/p/go.text/transform"
    "code.google.com/p/go.text/unicode/norm"
    "compress/zlib"
    "encoding/gob"
    "fmt"
    "github.com/AlasdairF/BinSearch"
    "io/ioutil"
    "os"
    "regexp"
    "runtime"
    "strings"
    "unicode"
    "unicode/utf8"
)

type TokensStruct struct {
    binsearch.Key_string
    Value [][5]uint32
}

type Trainer struct {
    Tokens      TokensStruct
    TokensCount uint
}

func checkErr(err error) {
    if err == nil {
        return
    }
    fmt.Println(`Some Error:`, err)
    panic(err)
}

// Local helper function for normalization of UTF8 strings.
func isMn(r rune) bool {
    return unicode.Is(unicode.Mn, r) // Mn: nonspacing marks
}

// This map is used by RemoveAccents function to convert non-accented characters.
var transliterations = map[rune]string{'Æ': "E", 'Ð': "D", 'Ł': "L", 'Ø': "OE", 'Þ': "Th", 'ß': "ss", 'æ': "e", 'ð': "d", 'ł': "l", 'ø': "oe", 'þ': "th", 'Œ': "OE", 'œ': "oe"}

//  removeAccentsBytes converts accented UTF8 characters into their non-accented equivalents, from a []byte.
func removeAccentsBytesDashes(b []byte) ([]byte, error) {
    mnBuf := make([]byte, len(b))
    t := transform.Chain(norm.NFD, transform.RemoveFunc(isMn), norm.NFC)
    n, _, err := t.Transform(mnBuf, b, true)
    if err != nil {
        return nil, err
    }
    mnBuf = mnBuf[:n]
    tlBuf := bytes.NewBuffer(make([]byte, 0, len(mnBuf)*2))
    for i, w := 0, 0; i < len(mnBuf); i += w {
        r, width := utf8.DecodeRune(mnBuf[i:])
        if r == '-' {
            tlBuf.WriteByte(' ')
        } else {
            if d, ok := transliterations[r]; ok {
                tlBuf.WriteString(d)
            } else {
                tlBuf.WriteRune(r)
            }
        }
        w = width
    }
    return tlBuf.Bytes(), nil
}

func LoadZip(filename string) ([]byte, error) {
    // Open file for reading
    fi, err := os.Open(filename)
    if err != nil {
        return nil, err
    }
    defer fi.Close()
    // Attach ZIP reader
    fz, err := zlib.NewReader(fi)
    if err != nil {
        return nil, err
    }
    defer fz.Close()
    // Pull
    data, err := ioutil.ReadAll(fz)
    if err != nil {
        return nil, err
    }
    return norm.NFC.Bytes(data), nil // return normalized
}

func getTokens(pibn string) []string {
    var data []byte
    var err error
    data, err = LoadZip(`/storedir/` + pibn + `/text.zip`)
    checkErr(err)
    data, err = removeAccentsBytesDashes(data)
    checkErr(err)
    data = bytes.ToLower(data)
    data = reg2.ReplaceAll(data, []byte("$2")) // remove contractions
    data = reg.ReplaceAllLiteral(data, nil)
    tokens := strings.Fields(string(data))
    return tokens
}

func (t *Trainer) Start() {
    data, err := ioutil.ReadFile(`list.txt`)
    checkErr(err)
    pibns := bytes.Fields(data)
    for i, pibn := range pibns {
        tokens := getTokens(string(pibn))
        t.addTokens(tokens)
        if i%100 == 0 {
            runtime.GC() // I added this just to try to stop the memory craziness, but it makes no difference
        }
    }
}

func (t *Trainer) addTokens(tokens []string) {
    for _, tok := range tokens {
        if _, ok := dictionary[tok]; ok {
            if indx, ok2 := t.Tokens.Find(tok); ok2 {
                ar := t.Tokens.Value[indx]
                ar[0]++
                t.Tokens.Value[indx] = ar
                t.TokensCount++
            } else {
                t.Tokens.AddKeyAt(tok, indx)
                t.Tokens.Value = append(t.Tokens.Value, [5]uint32{0, 0, 0, 0, 0})
                copy(t.Tokens.Value[indx+1:], t.Tokens.Value[indx:])
                t.Tokens.Value[indx] = [5]uint32{1, 0, 0, 0, 0}
                t.TokensCount++
            }
        }
    }
    return
}

func LoadDictionary() {
    dictionary = make(map[string]bool)
    data, err := ioutil.ReadFile(`dictionary`)
    checkErr(err)
    words := bytes.Fields(data)
    for _, word := range words {
        strword := string(word)
        dictionary[strword] = false
    }
}

var reg = regexp.MustCompile(`[^a-z0-9\s]`)
var reg2 = regexp.MustCompile(`\b(c|l|all|dall|dell|nell|sull|coll|pell|gl|agl|dagl|degl|negl|sugl|un|m|t|s|v|d|qu|n|j)'([a-z])`) //contractions
var dictionary map[string]bool

func main() {
    trainer := new(Trainer)
    LoadDictionary()
    trainer.Start()
}

Solution

Make sure that if you're tokenizing from a large string, you avoid memory pinning: in Go, a substring shares the backing array of the string it was sliced from, so holding on to small tokens keeps the whole file's contents alive. From the comments above, it sounds like the tokens are substrings of a large string.

You may need to add a little extra in your getTokens() function so that the tokens don't pin memory; copying each token into a fresh string (as below, via string([]byte(t))) breaks the reference to the original backing array.

func getTokens(...) {
    // near the end of your program
    for i, t := range(tokens) {
        tokens[i] = string([]byte(t))
    }
}

By the way, reading the whole file into memory using ioutil.ReadFile all at once looks dubious. Are you sure you can't use bufio.Scanner?
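
For list.txt, for example, a whitespace-token loop with bufio.Scanner could look roughly like this (a sketch reusing the question's Trainer, checkErr, getTokens and addTokens; "bufio" would need to be added to the imports):

func (t *Trainer) Start() {
    f, err := os.Open(`list.txt`)
    checkErr(err)
    defer f.Close()
    scanner := bufio.NewScanner(f)
    scanner.Split(bufio.ScanWords) // whitespace-separated entries, like bytes.Fields
    for scanner.Scan() {
        t.addTokens(getTokens(scanner.Text()))
    }
    checkErr(scanner.Err())
}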

I'm looking at the code more closely... if you are truly concerned about memory, take advantage of io.Reader. You should try to avoid sucking in the content of a whole file at once. Use io.Reader and the transform "along the grain". The way you're using it now is against the grain of its intent. The whole point of the transform package you're using is to construct flexible Readers that can stream through data.

For example, here's a simplification of what you're doing:

package main

import (
    "bufio"
    "bytes"
    "fmt"
    "unicode/utf8"

    "code.google.com/p/go.text/transform"
)

type AccentsTransformer map[rune]string

func (a AccentsTransformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
    for nSrc < len(src) {
        // If we're at the edge, note this and return.
        if !atEOF && !utf8.FullRune(src[nSrc:]) {
            err = transform.ErrShortSrc
            return
        }
        r, width := utf8.DecodeRune(src[nSrc:])
        if r == utf8.RuneError && width == 1 {
            err = fmt.Errorf("Decoding error")
            return
        }
        if d, ok := a[r]; ok {
            if nDst+len(d) > len(dst) {
                err = transform.ErrShortDst
                return
            }
            copy(dst[nDst:], d)
            nSrc += width
            nDst += len(d)
            continue
        }

        if nDst+width > len(dst) {
            err = transform.ErrShortDst
            return
        }
        copy(dst[nDst:], src[nSrc:nSrc+width])
        nDst += width
        nSrc += width
    }
    return
}

func main() {
    transliterations := AccentsTransformer{'Æ': "E", 'Ø': "OE"}
    testString := "cØØl beÆns"
    b := transform.NewReader(bytes.NewBufferString(testString), transliterations)
    scanner := bufio.NewScanner(b)
    scanner.Split(bufio.ScanWords)
    for scanner.Scan() {
        fmt.Println("token:", scanner.Text())
    }
}

It becomes really easy then to chain transformers together. So, for example, if we wanted to remove all hyphens from the input stream, it's just a matter of using transform.Chain appropriately:

func main() {
    transliterations := AccentsTransformer{'Æ': "E", 'Ø': "OE"}
    removeHyphens := transform.RemoveFunc(func(r rune) bool {
        return r == '-'
    })
    allTransforms := transform.Chain(transliterations, removeHyphens)

    testString := "cØØl beÆns - the next generation"
    b := transform.NewReader(bytes.NewBufferString(testString), allTransforms)
    scanner := bufio.NewScanner(b)
    scanner.Split(bufio.ScanWords)
    for scanner.Scan() {
        fmt.Println("token:", scanner.Text())
    }
}

I have not exhaustively tested the code above, so please don't just copy-and-paste it without sufficient tests. :P I just cooked it up fast. But this kind of approach --- avoiding whole-file reading --- will scale better because it will read the file in chunks.
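
Applied to the question's own pipeline, the same idea might look roughly like the sketch below (untested; streamTokens is a hypothetical replacement for LoadZip plus getTokens, it assumes the accent and dash handling has been rewritten as Transformers as above, and it needs bufio, os, compress/zlib and the transform package imported; norm.NFC implements transform.Transformer as well, so it can go into the same chain):

// streamTokens never holds the whole decompressed file in memory: it streams
// the zlib output through the transformer chain and hands out whitespace-
// separated tokens one at a time via the callback.
func streamTokens(filename string, chain transform.Transformer, each func(string)) error {
    fi, err := os.Open(filename)
    if err != nil {
        return err
    }
    defer fi.Close()

    fz, err := zlib.NewReader(fi)
    if err != nil {
        return err
    }
    defer fz.Close()

    scanner := bufio.NewScanner(transform.NewReader(fz, chain))
    scanner.Split(bufio.ScanWords)
    for scanner.Scan() {
        // scanner.Text() copies the bytes into a fresh string, so the tokens
        // returned here do not pin the scanner's read buffer.
        each(scanner.Text())
    }
    return scanner.Err()
}

The lowercasing and the contraction regexes would still need a per-token pass (or Transformers of their own), but the decompressed file never has to sit in memory in one piece.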
