将R data.frame导出到SPSS [英] Export R data.frame to SPSS
问题描述
有一个包含可以写SPS和CSV文件的函数 write.foreign()
的包 foreign
。 SPS文件比可以将CSV字段读入SPSS包括标签。到目前为止,但这个函数有一些问题:
- 较新的SPSS版本可能会显示一个错误,
DATA LIST
- 如果通过
attr $ c>,这些都会丢失。
- 即使SPSS vesion支持高达32767的字符串,函数
write.foreign()
- 如果使用任何字符变量,则为星号(*),但是较新的SPSS版本无法处理。
- CSV文件以逗号分隔,并且不能使用引号,因此字符串中不允许使用逗号。
- 非ASCII字符崩溃导入
- 如果您的字符包含任何NA值,您将看到...
...这样的错误消息:
if(any(lengths> 255L)) stop(不能处理大于255的字符变量):
缺少值,其中TRUE / FALSE需要
我花了很多时间,然后发现了一个很好的帖子( http://r.789695.n4.nabble.com/SPSS-export-in-R-package-foreign-td921491.html )开始并使其更好。
此函数可替代
datafile
指定完整路径(!)(如果使用原始 foreign:write.foreign()$ 注意:此脚本将替换制表符(TAB)和其他间距(包括CR + LF)字符串通过空白而没有警告。可以考虑使用 GET DATA
而不是麻烦的 DATA LIST
来解决这个限制。
注意:可能有一个警告在FUN(X [[i]],...):可能完全丧失模数精度 POSIXlt
和 POSIXct
变量尚未由脚本正确处理。 writeForeignMySPSS = function(df,datafile,codefile,varnames = NULL,len = 32767){
adQuote< - function(x)paste(\,x,\ )
#DATA LIST的最后一个变量不能为空
if(any(is.na(df [[length(df)]]))) {
df $ END_CASE = 0
}
#http://stackoverflow.com/questions/5173692/how-to-return-number-of-decimal-places- in-r
decimalplaces< - function(x){
y = x [!is.na(x)]
if(length(y)== 0){
return(0)
}
if(any((y %% 1)!= 0)){
info = strsplit(sub('0 + $','字符(y)),。,fixed = TRUE)
info = info [sapply(info,FUN = length)== 2]
if(length(info)> = 2){
dec = nchar(unlist(info))[seq(2,length(info),2)]
} else {
return(0)
}
return(max(dec,na.rm = T))
} else {
return(0)
}
}
dfn < lapply(df,function(x)if(is.factor(x))
as.numeric(x)
else x)
#布尔变量b $ b bv = sapply(dfn,is.logical)
for(v其中(bv)){
dfn [[v]] = ifelse(dfn [[v]],1,0 )
}
varlabels< - names(df)
#在适用的地方使用注释
for(i in 1:length(df)){
cm = comment(df [[i]])
if(ischaracter(cm)&& (length(cm)> 0)){
varlabels [i] = comment(df [[i]])
}
}
if .null(varnames)){
varnames< - abbreviate(names(df),8L)
if(any(sapply(varnames,nchar)> 8L))
stop我不能将变量名称缩写为八个或更少的字母)
if(any(varnames!= varlabels))
warning(一些变量名称被缩写)
}
varnames< - gsub([^ [:alnum:] _ \\ $ @#],\\。,varnames)
dl.varnames< - varnames
chv = sapply(df,is.character)
if(any(chv)){
for(v in which(chv)){
dfn [[v]] = gsub \\ s,,dfn [[v]])
}
lengths< - sapply(df [chv],function(v)max(nchar(v),na。 rm = T))
if(any(lengths> len)){
warning(paste(Clipped strings in,names(df [chv]),to,len, ))
for(v in which(chv)){
df [[v]] = substr(df [[v]],start = 1,stop = len)
}
}
长度[is.infinite(长度)] = 0
长度[长度< 1] = 1
lengths< - paste((A,lengths,),sep =)
#star< - ifelse(c(FALSE,diff )> 1)),*,
dl.varnames [chv]< - paste(dl.varnames [chv],lengths)
}
#和bools
nmv = sapply(df,is.numeric)
dbv = sapply(df,is.numeric)
nv =(nmv | dbv)
decimals = sapply [nv],FUN = decimalplaces)
dl.varnames [nv] = paste(dl.varnames [nv],(F,decimals + 8,。,decimals,),sep = )
if(length(bv)> 0){
dl.varnames [bv] = paste(dl.varnames [bv],(F1.0))
}
rmv =!(chv | nv | bv)
if(length(rmv)> 0){
dl.varnames [rmv] = paste(dl.varnames [rmv], (F8.0))
}
#输出中断
brv = seq(1,length(dl.varnames),10)
dl.varnames [brv] =粘贴(dl.varnames [brv],\\\
,sep =)
cat(SET LOCALE = ENGLISH.\\\
,file = codefile)
cat (DATA LIST FILE =,adQuote(datafile),free(TAB)\\\
,file = codefile,append = TRUE)
cat(/,dl.varnames,。 \ n,file = codefile,append = TRUE)
cat(VARIABLE LABELS\ n,file = codefile,append = TRUE)
cat(粘贴(varnames,adQuote(varlabels) \ n),.\\\
,file = codefile,
append = TRUE)
因子< - sapply(df,is.factor)
if因此){
cat(\\\
VALUE LABELS\\\
,file = codefile,append = TRUE)
for(v其中\ n,file = codefile,append = TRUE)
cat(varnames [v],\\\
,file = codefile,append = TRUE)
levs < [v]])
cat(粘贴(1:length(levs),adQuote(levs),\\\
,sep =),
file = codefile,append = TRUE)
}
cat(。\\\
,file = codefile,append = TRUE)
}
#存储在attr < - !unlist(lapply(sapply(df,FUN = attr,which =1),FUN = is.null))
if(any(attribs)){
cat \\ nVALUE LABELS\\\
,file = codefile,append = TRUE)
for(v其中(attribs)){
cat(/ \\\
,file = codefile,append = TRUE )
cat(varnames [v],\\\
,file = codefile,append = TRUE)
#检查标记值
tc = list()
for in dimnames(table(df [[v]]))[[1]]){
if(!is.null(tcl < - attr(df [[v]],tcv))){
tc [tcv] = tcl
}
}
cat(paste(names(tc),tc,\\\
,sep =),
file = codefile,append = TRUE)
}
cat(。\\\
,file = codefile,append = TRUE)
}
ordinal< - sapply(df,is.ordered)
if(any(ordinal)){
tmp = varnames [ordinal]
brv = seq(1,length(tmp),10)
tmp [brv] =粘贴(tmp [brv],\\\
)
cat(粘贴(\\\
VARIABLE LEVEL,粘贴(tmp,collapse =),(ORDINAL)。 \ n),
file = codefile,append = TRUE)
}
num < - sapply(df,is.numeric)
if {
tmp = varnames [num]
brv = seq(1,length(tmp),10)
tmp [brv] = paste(tmp [brv],\\\
)
cat(粘贴(\\\
VARIABLE LEVEL,粘贴(tmp,collapse =),(SCALE).\\\
),
file = codefile,append = TRUE)
}
cat(\\\
EXECUTE.\\\
,file = codefile,append = TRUE)
write.table(dfn,file = datafile,row = FALSE,col = FALSE,
sep =\t,quote = F,na =,eol =\\\
,fileEncoding =UTF-8)
} / code>
从长远来看,这些更改可能被视为合并到 foreign
package。不幸的是,r项目的错误报告系统目前仅限于以前注册的开发人员。
There is a package foreign
with a function write.foreign()
that can write a SPS and CSV file. The SPS file than can read the CSV fiel into SPSS including labels. Fine so far, but there are some issues with that function:
- Newer SPSS versions may show an error that you have too few format definitions in
DATA LIST
- If there are "labels" for numeric variables stored via
attr()
, these are lost.
- Even if the SPSS vesion supports strings up to 32767, the function
write.foreign()
stops if there are more than 255 in any variable.
- Theres a star (*) if any character variables are used, but newer SPSS versions cannot handle that.
- The CSV file is comma-separated and does (can) not use quotes, therefore no commas are allowed in strings (character)
- Non-ASCII caracters (e.g. umlauts) will crash the import
- Should you have a character that contains any NA value, you'll see...
... an error message like this:
Error in if (any(lengths > 255L)) stop("Cannot handle character variables longer than 255") :
missing value where TRUE/FALSE needed
I spent a lot of time with that and then found a good posting (http://r.789695.n4.nabble.com/SPSS-export-in-R-package-foreign-td921491.html) to start on and make it better. Here's my result, I'd like to share with you.
解决方案 This function is a replacement for foreign:write.foreign
to handle the issues stated above.
Note: To avoid issues with SPSS finding the CSV file, please specify the full path (!) at least for datafile
(also if using the original foreign:write.foreign()
).
Note: This script will replace a tabulator (TAB) and other spacing (incl. CR+LF) in strings by a blank without warning. One may consider using GET DATA
instead of the troublesome DATA LIST
to solve that limitation.
Note: There may be a warning In FUN(X[[i]], ...) : probable complete loss of accuracy in modulus
- this refers to counting the decimals and can be ignored.
Note: POSIXlt
and POSIXct
variables are not yet handled by the script properly.
writeForeignMySPSS = function (df, datafile, codefile, varnames = NULL, len = 32767) {
adQuote <- function (x) paste("\"", x, "\"", sep = "")
# Last variable must not be empty for DATA LIST
if (any(is.na(df[[length(df)]]))) {
df$END_CASE = 0
}
# http://stackoverflow.com/questions/5173692/how-to-return-number-of-decimal-places-in-r
decimalplaces <- function(x) {
y = x[!is.na(x)]
if (length(y) == 0) {
return(0)
}
if (any((y %% 1) != 0)) {
info = strsplit(sub('0+$', '', as.character(y)), ".", fixed=TRUE)
info = info[sapply(info, FUN=length) == 2]
if (length(info) >= 2) {
dec = nchar(unlist(info))[seq(2, length(info), 2)]
} else {
return(0)
}
return(max(dec, na.rm=T))
} else {
return(0)
}
}
dfn <- lapply(df, function(x) if (is.factor(x))
as.numeric(x)
else x)
# Boolean variables (dummy coding)
bv = sapply(dfn, is.logical)
for (v in which(bv)) {
dfn[[v]] = ifelse(dfn[[v]], 1, 0)
}
varlabels <- names(df)
# Use comments where applicable
for (i in 1:length(df)) {
cm = comment(df[[i]])
if (is.character(cm) && (length(cm) > 0)) {
varlabels[i] = comment(df[[i]])
}
}
if (is.null(varnames)) {
varnames <- abbreviate(names(df), 8L)
if (any(sapply(varnames, nchar) > 8L))
stop("I cannot abbreviate the variable names to eight or fewer letters")
if (any(varnames != varlabels))
warning("some variable names were abbreviated")
}
varnames <- gsub("[^[:alnum:]_\\$@#]", "\\.", varnames)
dl.varnames <- varnames
chv = sapply(df, is.character)
if (any(chv)) {
for (v in which(chv)) {
dfn[[v]] = gsub("\\s", " ", dfn[[v]])
}
lengths <- sapply(df[chv], function(v) max(nchar(v), na.rm=T))
if (any(lengths > len)) {
warning(paste("Clipped strings in", names(df[chv]), "to", len, "characters"))
for (v in which(chv)) {
df[[v]] = substr(df[[v]], start=1, stop=len)
}
}
lengths[is.infinite(lengths)] = 0
lengths[lengths < 1] = 1
lengths <- paste("(A", lengths, ")", sep = "")
# star <- ifelse(c(FALSE, diff(which(chv) > 1)), " *",
dl.varnames[chv] <- paste(dl.varnames[chv], lengths)
}
# decimals and bools
nmv = sapply(df, is.numeric)
dbv = sapply(df, is.numeric)
nv = (nmv | dbv)
decimals = sapply(df[nv], FUN=decimalplaces)
dl.varnames[nv] = paste(dl.varnames[nv], " (F", decimals+8, ".", decimals, ")", sep="")
if (length(bv) > 0) {
dl.varnames[bv] = paste(dl.varnames[bv], "(F1.0)")
}
rmv = !(chv | nv | bv)
if (length(rmv) > 0) {
dl.varnames[rmv] = paste(dl.varnames[rmv], "(F8.0)")
}
# Breaks in output
brv = seq(1, length(dl.varnames), 10)
dl.varnames[brv] = paste(dl.varnames[brv], "\n", sep=" ")
cat("SET LOCALE = ENGLISH.\n", file = codefile)
cat("DATA LIST FILE=", adQuote(datafile), " free (TAB)\n", file = codefile, append = TRUE)
cat("/", dl.varnames, " .\n\n", file = codefile, append = TRUE)
cat("VARIABLE LABELS\n", file = codefile, append = TRUE)
cat(paste(varnames, adQuote(varlabels), "\n"), ".\n", file = codefile,
append = TRUE)
factors <- sapply(df, is.factor)
if (any(factors)) {
cat("\nVALUE LABELS\n", file = codefile, append = TRUE)
for (v in which(factors)) {
cat("/\n", file = codefile, append = TRUE)
cat(varnames[v], " \n", file = codefile, append = TRUE)
levs <- levels(df[[v]])
cat(paste(1:length(levs), adQuote(levs), "\n", sep = " "),
file = codefile, append = TRUE)
}
cat(".\n", file = codefile, append = TRUE)
}
# Labels stored in attr()
attribs <- !unlist(lapply(sapply(df, FUN=attr, which="1"), FUN=is.null))
if (any(attribs)) {
cat("\nVALUE LABELS\n", file = codefile, append = TRUE)
for (v in which(attribs)) {
cat("/\n", file = codefile, append = TRUE)
cat(varnames[v], " \n", file = codefile, append = TRUE)
# Check labeled values
tc = list()
for (tcv in dimnames(table(df[[v]]))[[1]]) {
if (!is.null(tcl <- attr(df[[v]], tcv))) {
tc[tcv] = tcl
}
}
cat(paste(names(tc), tc, "\n", sep = " "),
file = codefile, append = TRUE)
}
cat(".\n", file = codefile, append = TRUE)
}
ordinal <- sapply(df, is.ordered)
if (any(ordinal)) {
tmp = varnames[ordinal]
brv = seq(1, length(tmp), 10)
tmp[brv] = paste(tmp[brv], "\n")
cat(paste("\nVARIABLE LEVEL", paste(tmp, collapse=" "), "(ORDINAL).\n"),
file = codefile, append = TRUE)
}
num <- sapply(df, is.numeric)
if (any(num)) {
tmp = varnames[num]
brv = seq(1, length(tmp), 10)
tmp[brv] = paste(tmp[brv], "\n")
cat(paste("\nVARIABLE LEVEL", paste(tmp, collapse=" "), "(SCALE).\n"),
file = codefile, append = TRUE)
}
cat("\nEXECUTE.\n", file = codefile, append = TRUE)
write.table(dfn, file = datafile, row = FALSE, col = FALSE,
sep = "\t", quote = F, na = "", eol = "\n", fileEncoding="UTF-8")
}
On the long term, the changes might be considered to be merged into the foreign
package. Unfortunately, the bug reporting system for the r-project is currently limited to previously registered developers.
这篇关于将R data.frame导出到SPSS的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!