如何将 pandas DataFrame 子类化? [英] How to subclass pandas DataFrame?
问题描述
关于这个主题已经有一些 SO 帖子,但我希望这里有人能更系统地说明:目前子类化 pandas.DataFrame 的最佳方式是什么,并且要满足以下两个(我认为是)通用的要求:
import numpy as np
import pandas as pd
class MyDF(pd.DataFrame):
    """Bare DataFrame subclass used to pose the question.

    It adds nothing to ``pd.DataFrame``; it only exists so the surrounding
    example can show which operations keep (or lose) the subclass type.
    """

    # How to subclass pandas DataFrame?
    pass
mydf = MyDF(np.random.randn(3, 4), columns=['A', 'B', 'C', 'D'])
print(type(mydf))  # <class '__main__.MyDF'>

# Requirement 1: Instances of MyDF, when calling standard methods of DataFrame,
# should produce instances of MyDF.
mydf_sub = mydf[['A', 'C']]
print(type(mydf_sub))  # <class 'pandas.core.frame.DataFrame'>

# Requirement 2: Attributes attached to instances of MyDF, when calling standard
# methods of DataFrame, should still attach to the output.
mydf.myattr = 1
mydf_cp1 = MyDF(mydf)
mydf_cp2 = mydf.copy()
print(hasattr(mydf_cp1, 'myattr'))  # False
print(hasattr(mydf_cp2, 'myattr'))  # False
另外,子类化 pandas.Series 时有什么显著的区别吗?谢谢。
对于要求1,只需定义 _constructor:
import pandas as pd
import numpy as np
class MyDF(pd.DataFrame):
    """DataFrame subclass whose derived results are built as ``MyDF``."""

    @property
    def _constructor(self):
        """Return the class pandas should use to build results from this frame."""
        return MyDF
mydf = MyDF(np.random.randn(3, 4), columns=['A', 'B', 'C', 'D'])
print(type(mydf))

# Column selection goes through _constructor, so the result type is printed
# here to compare against the type printed above.
mydf_sub = mydf[['A', 'C']]
print(type(mydf_sub))
我认为要求2没有简单的解决方案,你需要定义 __init__、copy,或者在 _constructor 中做一些处理,例如:
import pandas as pd
import numpy as np
class MyDF(pd.DataFrame):
    """DataFrame subclass that carries a fixed set of custom attributes.

    The attribute names listed in ``_attributes_`` are copied onto every frame
    built through ``_constructor`` and onto a ``MyDF`` built from another
    ``MyDF``.
    """

    # Comma-separated names of the instance attributes to carry over.
    _attributes_ = "myattr1,myattr2"

    def __init__(self, *args, **kw):
        """Build the frame, copying custom attributes when wrapping a MyDF.

        Arguments are forwarded unchanged to ``pd.DataFrame.__init__``.
        """
        super(MyDF, self).__init__(*args, **kw)
        # Only the single-positional-argument form ``MyDF(other_mydf)`` is
        # treated as a copy that should inherit the source's attributes.
        if len(args) == 1 and isinstance(args[0], MyDF):
            args[0]._copy_attrs(self)

    def _copy_attrs(self, df):
        """Copy every attribute named in ``_attributes_`` from self onto df.

        Attributes missing on self are written to df as ``None``.
        """
        for attr in self._attributes_.split(","):
            # Write straight into the instance dict instead of using setattr.
            df.__dict__[attr] = getattr(self, attr, None)

    @property
    def _constructor(self):
        """Return a factory that builds a MyDF and copies self's attributes."""
        def f(*args, **kw):
            df = MyDF(*args, **kw)
            self._copy_attrs(df)
            return df
        return f
mydf = MyDF(np.random.randn(3, 4), columns=['A', 'B', 'C', 'D'])
print(type(mydf))

mydf_sub = mydf[['A', 'C']]
print(type(mydf_sub))

# Set a custom attribute, then read it back from two kinds of copy.
mydf.myattr1 = 1
mydf_cp1 = MyDF(mydf)
mydf_cp2 = mydf.copy()
# Formatted explicitly so the output is "a b" on both Python 2 and Python 3.
print("%s %s" % (mydf_cp1.myattr1, mydf_cp2.myattr1))
Subclassing pandas classes seems a common need but I could not find references on the subject. (It seems that pandas developers are still working on it: https://github.com/pydata/pandas/issues/60).
There are some SO threads on the subject, but I am hoping that someone here can give a more systematic account of the current best way to subclass pandas.DataFrame so that it satisfies two, I think, general requirements:
import numpy as np
import pandas as pd
class MyDF(pd.DataFrame):
    """Bare DataFrame subclass used to pose the question.

    It adds nothing to ``pd.DataFrame``; it only exists so the surrounding
    example can show which operations keep (or lose) the subclass type.
    """

    # how to subclass pandas DataFrame?
    pass
mydf = MyDF(np.random.randn(3,4), columns=['A','B','C','D'])
print(type(mydf))  # <class '__main__.MyDF'>

# Requirement 1: Instances of MyDF, when calling standard methods of DataFrame,
# should produce instances of MyDF.
mydf_sub = mydf[['A','C']]
print(type(mydf_sub))  # <class 'pandas.core.frame.DataFrame'>

# Requirement 2: Attributes attached to instances of MyDF, when calling standard
# methods of DataFrame, should still attach to the output.
mydf.myattr = 1
mydf_cp1 = MyDF(mydf)
mydf_cp2 = mydf.copy()
print(hasattr(mydf_cp1, 'myattr'))  # False
print(hasattr(mydf_cp2, 'myattr'))  # False
And are there any significant differences when subclassing pandas.Series? Thank you.
For Requirement 1, just define _constructor:
import pandas as pd
import numpy as np
class MyDF(pd.DataFrame):
    """DataFrame subclass whose derived results are built as ``MyDF``."""

    @property
    def _constructor(self):
        """Return the class pandas should use to build results from this frame."""
        return MyDF
mydf = MyDF(np.random.randn(3,4), columns=['A','B','C','D'])
print(type(mydf))

# Column selection goes through _constructor, so the result type is printed
# here to compare against the type printed above.
mydf_sub = mydf[['A','C']]
print(type(mydf_sub))
I think there is no simple solution for Requirement 2; you need to define __init__, copy, or do something in _constructor, for example:
import pandas as pd
import numpy as np
class MyDF(pd.DataFrame):
    """DataFrame subclass that carries a fixed set of custom attributes.

    The attribute names listed in ``_attributes_`` are copied onto every frame
    built through ``_constructor`` and onto a ``MyDF`` built from another
    ``MyDF``.
    """

    # Comma-separated names of the instance attributes to carry over.
    _attributes_ = "myattr1,myattr2"

    def __init__(self, *args, **kw):
        """Build the frame, copying custom attributes when wrapping a MyDF.

        Arguments are forwarded unchanged to ``pd.DataFrame.__init__``.
        """
        super(MyDF, self).__init__(*args, **kw)
        # Only the single-positional-argument form ``MyDF(other_mydf)`` is
        # treated as a copy that should inherit the source's attributes.
        if len(args) == 1 and isinstance(args[0], MyDF):
            args[0]._copy_attrs(self)

    def _copy_attrs(self, df):
        """Copy every attribute named in ``_attributes_`` from self onto df.

        Attributes missing on self are written to df as ``None``.
        """
        for attr in self._attributes_.split(","):
            # Write straight into the instance dict instead of using setattr.
            df.__dict__[attr] = getattr(self, attr, None)

    @property
    def _constructor(self):
        """Return a factory that builds a MyDF and copies self's attributes."""
        def f(*args, **kw):
            df = MyDF(*args, **kw)
            self._copy_attrs(df)
            return df
        return f
mydf = MyDF(np.random.randn(3,4), columns=['A','B','C','D'])
print(type(mydf))

mydf_sub = mydf[['A','C']]
print(type(mydf_sub))

# Set a custom attribute, then read it back from two kinds of copy.
mydf.myattr1 = 1
mydf_cp1 = MyDF(mydf)
mydf_cp2 = mydf.copy()
# Formatted explicitly so the output is "a b" on both Python 2 and Python 3.
print("%s %s" % (mydf_cp1.myattr1, mydf_cp2.myattr1))
这篇关于如何将 pandas DataFrame 子类化的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!