make.zdata<-function(db, table, factors=9){ rval<-dbGetQuery(db,sqlsubst("select * from %%tbl%% limit 1", list(tbl=table))) if (length(factors)==0) return(rval[FALSE,]) rval<-as.list(rval) ## lists are faster if(is.character(factors)){ for(f in factors){ levs<- dbGetQuery(db,sqlsubst("select distinct %%v%% from %%tbl%% order by %%v%%", list(v=f,tbl=table)))[[1]] rval[[f]]<-factor(rval[[f]], levels=levs) } class(rval)<-"data.frame" return(rval[FALSE,]) } else { ##numeric limit on levels for(f in names(rval)){ if (!is.numeric(factors) || length(factors)>1) stop("invalid specification of 'factors'") levs<- dbGetQuery(db, sqlsubst("select distinct %%v%% from %%tbl%% order by %%v%% limit %%n%%", list(v=f, tbl=table, n=factors+2)))[[1]] if (length(na.omit(levs))<=factors) rval[[f]]<-factor(rval[[f]], levels=levs) } class(rval)<-"data.frame" return(rval[FALSE,]) } } close.sqldataset<-function(con, tidy=TRUE,...){ gc() ## make sure any dead model matrices are finalized. if(!isIdCurrent(con$conn)){ warning("connection already closed") return(TRUE) } if (tidy){ pending<-dbListResults(con$conn) if (length(pending)) lapply(pending, dbClearResult) } dbDisconnect(con$conn) } open.sqldataset<-function(con, db=NULL, ...){ library(RSQLite) sqlite<-dbDriver("SQLite") if (is.null(con$dbname)) stop("the dataset used a temporary database") if (is.null(db)){ if(!file.exists(con$dbname)) stop("database file",con$dbname,"not found") con$conn<-dbConnect(sqlite, dbname=con$dbname) }else{ if (isIdCurrent(db)) con$conn<-db else stop("'db' is an expired connection") } if (!is.null(con$subset)){ con$subset$conn<-con$conn } con } open.sqlmodelmatrix<-function(con, data,...){ library(RSQLite) if (isIdCurrent(data$conn)) con$conn<-data$conn else stop("the dataset has an expired connection") con } finalizeSubset<-function(e){ dbGetQuery(e$conn, sqlsubst("drop table %%tbl%%",list(tbl=e$table))) } sqlexpr<-function(expr, data){ nms<-new.env(parent=emptyenv()) assign("%in%"," IN ", nms) assign("&", " AND ", nms) assign("|"," OR ", nms) out <-textConnection("str","w",local=TRUE) inorder<-function(e){ if(length(e) ==1) { if (is.character(e)) cat(adquote(e),file=out) else cat(e, file=out) } else if (length(e)==2){ nm<-deparse(e) if (exists(nm, nms)) nm<-get(nms) cat(deparse(e), file=out) } else if (deparse(e[[1]])=="c"){ cat("(", file=out) for(i in seq_len(length(e[-1]))) { if(i>1) cat(",", file=out) inorder(e[[i+1]]) } cat(")", file=out) } else { cat("(",file=out) inorder(e[[2]]) nm<-deparse(e[[1]]) if (exists(nm,nms)) nm<-get(nm,nms) cat(nm,file=out) inorder(e[[3]]) cat(")",file=out) } } inorder(expr) close(out) paste("(",str,")") } subset.sqldataset<-function(x,subset,...){ subset<-substitute(subset) rval<-new.env() rval$subset<-sqlexpr(subset) rval$table<-basename(tempfile("_sbs_")) rval$idx<-basename(tempfile("_idx_")) rval$weights<-"_subset_weight_" query<-sqlsubst("create table %%tbl%% as select %%key%%, %%wt%%*%%subset%% as _subset_weight_ from %%base%%", list(tbl=rval$table, key=x$key, wt=x$weights, subset=rval$subset, base=x$table ) ) dbGetQuery(x$conn, query) dbGetQuery(x$conn,sqlsubst("create unique index %%idx%% on %%tbl%%(%%key%%)", list(idx=rval$idx,tbl=rval$table, key=x$key))) rval$conn<-x$conn reg.finalizer(rval, finalizeSubset) x$subset<-rval x$call<-sys.call(-1) x } sqldataset<-function(data, weights="1", table.name=basename(tempfile("_tbl_")), key="row_names"){ library(RSQLite) sqlite<-dbDriver("SQLite") if (is.character(data) && length(data)==1) db<-dbConnect(sqlite,dbname=data) else{ db<-dbConnect(sqlite) dbWriteTable(db,table.name,data) } if (is.character(data) && length(data)==1){ zdata<-make.zdata(db,table.name) } else { zdata<-data[numeric(0),] names(zdata)<-make.db.names(db,names(zdata)) dbGetQuery(db, sqlsubst("create unique index %%idx%% on %%tbl%%(%%key%%)", list(idx=basename(tempfile("idx")), tbl=table.name,key=key))) } rval<-list(conn=db, table=table.name, weights=weights, call=sys.call(), zdata=zdata, key=key ) if (is.character(data) && length(data)==1) rval$dbname<-data else rval$dbname<-NULL class(rval)<-"sqldataset" rval } sqlsubst<-function(strings, values){ for(nm in names(values)){ if (is.null(values[[nm]])) next if (length(values[[nm]])>1) values[[nm]]<-paste(values[[nm]],collapse=", ") strings<-gsub(paste("%%",nm,"%%",sep=""),values[[nm]], strings) } strings } print.sqldataset<-function(x,...){ cat("SQLite data set:\n") print(x$call) invisible(x) } dim.sqldataset<-function(x){ if(is.null(x$subset)) nrows<-dbGetQuery(x$conn, sqlsubst("select count(*) from %%table%%", list(table=x$table)))[[1]] else nrows<-dbGetQuery(x$conn, sqlsubst("select count(*) from %%table%% where %%wt%%>0", list(table=x$subset$table, wt=x$subset$weights)))[[1]] ncols<-ncol(x$zdata) c(nrows,ncols) } sqllm<-function(formula, data){ tms<-terms(formula) yname<-as.character(attr(tms,"variables")[[2]]) ## handle subpopulations if (is.null(data$subset)){ tablename<-data$table wtname<-data$weights } else { tablename<-sqlsubst(" %%tbl%% inner join %%subset%% using(%%key%%) ", list(tbl=data$table, subset=data$subset$table, key=data$key)) wtname<-data$subset$weights } mm<-sqlmodelmatrix(formula, data, fullrank=TRUE) termnames<-mm$terms tablename<-sqlsubst("%%tbl%% inner join %%mm%% using(%%key%%)", list(tbl=tablename,mm=mm$table, key=data$key)) p<-length(termnames) n<-dim(data)[1] mfy<-basename(tempfile("_y_")) sumxy<-paste("sum(",termnames,"*%%mfy%%*%%wt%%) as _xy_",termnames,sep="") sumsq<-outer(termnames,termnames, function(i,j) paste("sum(",i,"*",j,"*%%wt%%)",sep="")) qxwx<-sqlsubst("select %%sumsq%% from %%table%%" , list(sumsq=sumsq, table=tablename, wt=wtname) ) xwx<-matrix(as.matrix(dbGetQuery(data$conn, qxwx)),p,p) qxwy <- sqlsubst("select %%sumxy%% from %%tablename%% inner join (select %%y%% as %%mfy%%, %%key%% from %%mf%%) using(%%key%%)", list(sumxy = sumxy, y = yname, key = data$key, tablename = tablename, mf=mm$mf, wt=wtname, mfy=mfy)) xwy<-drop(as.matrix(dbGetQuery(data$conn, qxwy))) beta<-solve(xwx,xwy) rtab<-basename(tempfile("_rt_")) muname<-basename(tempfile("_mu_")) qmu<-paste("(",paste(termnames,"*",formatC(beta,format="fg",digits=16),collapse="+"),") as ",muname) qrtab<-sqlsubst("create table %%rtab%% as select %%y%%, %%qmu%%, %%key%% from (select %%y%%, %%key%% from %%mf%%) inner join %%mm%% using(%%key%%)", list(rtab=rtab,y=yname, key=data$key,qmu=qmu,mf=mm$mf,mm=mm$table)) dbGetQuery(data$conn, qrtab) on.exit(dbGetQuery(data$conn, paste("drop table ",rtab)),add=TRUE) xwxinv<-solve(xwx) qsigma2 <- sqlsubst("select sum((%%y%%-%%mu%%)*(%%y%%-%%mu%%)*%%wt%%) as sse, sum(%%wt%%) as sumwt from %%rtab%% inner join (select %%key%%, %%wt%% from (select * from %%table%%)) using(%%key%%)", list(y=yname, mu=muname, rtab=rtab, table=tablename,wt=wtname, key=data$key)) sigma2<-dbGetQuery(data$conn, qsigma2) v<-(xwxinv*sigma2$sumwt/n)* (sigma2$sse[1]/sigma2$sumwt[1])*(n)/(n-p) names(beta)<-termnames dimnames(v)<-list(termnames, termnames) attr(beta, "var")<-v beta } dim.sqlmm<-function(x){ n<-dbGetQuery(x$conn, sqlsubst("select count(*) from %%table%%",list(table=x$table)))[[1]] p<-length(x$terms) c(n,p) } adquote<-function(s) paste("\"",s,"\"",sep="") sqlmodelmatrix<-function(formula, data, fullrank=TRUE){ mmcol<-function(variables,levels, name.only=FALSE){ if (length(variables)==0){ if(name.only) return("_Intercept_") else return("1 as _Intercept_") } rval<-paste(variables,"==",adquote(levels),sep="") termname<-paste(variables,levels,sep="") if (length(rval)>1){ rval<-paste(paste("(",rval,")",sep=""),collapse="*") termname<-paste(termname,collapse="_") } if (name.only) make.db.names(data$conn, termname) else paste(rval,"as",make.db.names(data$conn, termname)) } if (!all(all.vars(formula) %in% names(data$zdata))) stop("some variables not in database") ok.names<-c("~","I","(","-","+","*") if (!all( all.names(formula) %in% c(ok.names, names(data$zdata)))) stop("unsupported transformations in formula") tms<-terms(formula) mf<-model.frame(formula,data$zdata,na.action=na.pass) mm<-model.matrix(tms, mf) ntms<-max(attr(mm,"assign")) mftable<-basename(tempfile("_mf_")) mmtable<-basename(tempfile("_mm_")) dbGetQuery(data$conn, sqlsubst("create table %%mf%% as select %%key%%, %%vars%% from %%table%%", list(mf=mftable, vars=all.vars(formula), table=data$table, id=data$id, strata=data$strata,key=data$key))) dbGetQuery(data$conn, sqlsubst("create unique index %%idx%% on %%tbl%%(%%key%%)", list(idx=basename(tempfile("idx")), tbl=mftable, key=data$key))) patmat<-attr(tms,"factors") nms<-attr(tms,"term.labels") orders<-attr(tms, "order") if (fullrank) contrastlevels<-function(f) {levels(f)[-1]} else contrastlevels<-levels mmterms<-lapply(1:ntms, function(i){ vars<-rownames(patmat)[as.logical(patmat[,nms[i]])] if (orders[i]==1 && is.null(levels(mf[[vars]]))) return(list(paste(vars," as _",vars,sep=""))) levs<-as.matrix(expand.grid(lapply(mf[vars],contrastlevels))) lapply(split(levs,row(levs)), function(ll) mmcol(vars,ll)) }) if (fullrank) mmterms<-c(mmterms, list(mmcol(NULL,NULL))) mmnames<-lapply(1:ntms, function(i){ vars<-rownames(patmat)[as.logical(patmat[,nms[i]])] if (orders[i]==1 && is.null(levels(mf[[vars]]))) return(list(paste("_",vars,sep=""))) levs<-as.matrix(expand.grid(lapply(mf[vars],contrastlevels))) lapply(split(levs,row(levs)), function(ll) mmcol(vars,ll,TRUE)) }) if (fullrank) mmnames<-c(mmnames, list(mmcol(NULL,NULL,TRUE))) mmquery<-sqlsubst("create table %%mm%% as select %%key%%, %%terms%% from %%mf%%", list(mm=mmtable, id=data$id, strata=data$strata, terms=unlist(mmterms), mf=mftable, key=data$key)) dbGetQuery(data$conn, mmquery) dbGetQuery(data$conn, sqlsubst("create unique index %%idx%% on %%tbl%%(%%key%%)", list(idx=basename(tempfile("idx")), tbl=mmtable, key=data$key))) rval<-new.env(parent=emptyenv()) rval$table<-mmtable rval$mf<-mftable rval$formula<-formula rval$terms<-unlist(mmnames) rval$call<-sys.call() rval$conn<-data$conn reg.finalizer(rval, sqlmmDrop) class(rval)<-"sqlmm" rval } sqlmmDrop<-function(mmobj){ dbGetQuery(mmobj$conn,sqlsubst("drop table %%mm%%", list(mm=mmobj$table))) dbGetQuery(mmobj$conn,sqlsubst("drop table %%mf%%", list(mf=mmobj$mf))) invisible(NULL) } head.sqlmm<-function(x,n=6,...) dbGetQuery(x$conn, sqlsubst("select * from %%mm%% limit %nn%", list(mm=x$table,nn=n)))