9.8 years ago
@Avsmith1474
As I mentioned in my previous answer, the packages mentioned above are good at set models, but can't alway do exactly the analysis you'd like to do. When doing that, I typically work via netcdf stored versions of the genotypes. I don't work this way unless necessary, as it is MUCH , MUCH slower compared to these optimized packages which typically can read all the genotypes as one go.
As you've requested an example on how to do this, I've put some code. (I did edit this slightly to remove some local stuff, and I didn't test the edited version. Should be close to working, still).
This code loads imputations from MACH
, but hopefully the logic is apparent enough that you can modify as needed.
library(ncdf)
load.dosages.ncdf <- function(chromosome) {
require(ncdf)
imputed <- sub("@", as.character(chromosome), "chr@.dose")
idfile <- "sampleIDs.txt"
ids <- read.table(idfile, header = TRUE)
snpfile <- sub("@", as.character(chromosome), "chr@.info")
snps <- read.table(snpfile, header = TRUE)
#Number of SNPs
NSNPS <- nrow(snps)
#Number of Inds
NID <- nrow(ids)
# Create the dimensions for SNPs and Inds (2D netcdf)
snpdim <- dim.def.ncdf("snp", "bases", 1:NSNPS)
sampledim <- dim.def.ncdf("id", "th", 1:NID, unlim = TRUE)
# Variatble for IDs (NOTE: netcdf is not good at holding
# text, so only storing integernames)
varID <- var.def.ncdf("sampleID", "person", dim = sampledim,
missval = 0, prec = "integer")
# Variable for SNPs (NOTE: this will be the position of the
# SNP)
varSNP <- var.def.ncdf("pos", "bp", dim = snpdim, missval = 0,
prec = "integer")
# Create dimension for dosasges. Float used to save space
# in files
vargenotype <- var.def.ncdf("dose", "dose", dim = list(snpdim,
sampledim), missval = -1, prec = "float")
# Holderss for alleles, will be entered in bytye
varrefallele <- var.def.ncdf("ref", "base", dim = snpdim,
missval = 0, prec = "byte")
varaltallele <- var.def.ncdf("alt", "base", dim = snpdim,
missval = 0, prec = "byte")
# rsID for SNPs, stored as integers
varsnpid <- var.def.ncdf("rsid", "base", dim = snpdim, missval = 0,
prec = "integer")
# Store both MAF and EAF for tracking.
# Would be easy to calculate on the fly, but kept here from
# laziness
vareaf <- var.def.ncdf("EAF", "freq", dim = snpdim, missval = -1,
prec = "float")
varmaf <- var.def.ncdf("MAF", "freq", dim = snpdim, missval = -1,
prec = "float")
# Track inmputation quality statistics
varqual <- var.def.ncdf("Quality", "qual", dim = snpdim,
missval = -1, prec = "float")
varrsq <- var.def.ncdf("Rsq", "rsq", dim = snpdim, missval = -1,
prec = "float")
# Flag for whether the SNPs is imputed or not
varimp <- var.def.ncdf("usedimp", "imp", dim = snpdim, missval = -1,
prec = "byte")
# Create output file name
outfname <- sub("@", as.character(chromosome), "chr@.mldose.ncdf")
# Create output ncdf file
outfile <- create.ncdf(outfname, list(varID, varSNP, varrefallele,
varaltallele, varsnpid, vareaf, varmaf, varqual, varrsq,
vargenotype, varimp), verbose = FALSE)
# Load details on SNPs
put.var.ncdf(outfile, varSNP, as.integer(snps$POS), start = 1,
count = NSNPS)
put.var.ncdf(outfile, varsnpid, as.integer(sub("^(rs|is)",
"", snps$SNP)), start = 1, count = NSNPS)
put.var.ncdf(outfile, varrefallele, match(snps$Al1, c("A",
"C", "G", "T"), nomatch = 0), start = 1, count = NSNPS)
put.var.ncdf(outfile, varaltallele, match(snps$Al2, c("A",
"C", "G", "T", "-"), nomatch = 0), start = 1, count = NSNPS)
put.var.ncdf(outfile, vareaf, snps$Freq1, start = 1, count = NSNPS)
put.var.ncdf(outfile, varmaf, snps$MAF, start = 1, count = NSNPS)
put.var.ncdf(outfile, varqual, snps$Quality, start = 1, count = NSNPS)
put.var.ncdf(outfile, varrsq, snps$Rsq, start = 1, count = NSNPS)
# Load details on IDs
put.var.ncdf(outfile, varID, ids$IID, start = 1, count = NID)
# File with genotypes
genotypes <- file(imputed, "r")
print(paste(NSNPS, "SNPs to process"))
# Iterate over file to load
for (i in 1:NID) {
# Scan in a single line
snpi <- scan(genotypes, what = character(0), nlines = 1,
quiet = TRUE, skip = 0)
# Load the dosages (from MACH imputation, skip the first
# two entries. File has one line per ind)
put.var.ncdf(outfile, vargenotype, vals = as.double(snpi[3:length(snpi)]),
start = c(1, i), count = c(NSNPS, 1))
# Tracking the results
if (!(i%%100))
print(paste("individual #", i, " of ", NID, sep = ""))
}
close(genotypes)
close(outfile)
}
# Loop over the chromosomes to load
for (i in 1:23) {
print(paste("Starting: chr", i, sep = ""))
load.dosages.ncdf(i)
print(warnings())
}
Then, once all these files are created, the following code does a Cox Regression. (I realize this is a model supported by ProbABEL
, but it gives you a sense of how to work with netcdf files.)
library(ncdf)
library(splines)
library(survival)
library(foreign)
FILE <- "data.txt"
pheno <- read.table(FILE, header = T)
# Function to run COX regression
# This the main thing to change when running
# different models
coxfitter <- function(geno, phenodf, formula, init) {
environment(formula) <- environment()
formula <- update(formula, . ~ geno + .)
model <- coxph(formula, init = init, data = phenodf)
beta <- coef(model)["geno"]
se <- coef(summary(model))["geno", "se(coef)"]
pz <- coef(summary(model))["geno", "Pr(>|z|)"]
return(c(beta = beta, se = se, n = model$n, pz = pz))
}
# Loop over the chromosome.
# Current test form only does chr22
# Change to 'for (CHR in 1:22){' for entire genome
for (CHR in 1:23) {
# Open NetCDF file
genonc <- open.ncdf(sub("@", as.character(CHR), "chr@.mldose.ncdf"))
# Get list of SNPs
SNPs <- get.var.ncdf(genonc, "rsid")
# Get list of IDs
IDs <- get.var.ncdf(genonc, "sampleID")
# Number of SNPs
NSNP <- dim(SNPs)
# Number of Samples
NID <- dim(IDs)
# Placeholders for results
beta <- rep(NA, NSNP)
se <- rep(NA, NSNP)
n <- rep(NA, NSNP)
pz <- rep(NA, NSNP)
ref <- c("A", "C", "G", "T")[get.var.ncdf(genonc, "ref")]
alt <- c("A", "C", "G", "T")[get.var.ncdf(genonc, "alt")]
id <- get.var.ncdf(genonc, "rsid")
pos <- get.var.ncdf(genonc, "pos")
# Used to compare order in dosages and input samples
keep <- match(pheno$IID, IDs)
# Basic model
m <- coxph(Surv(FOLLOWUP, PHENO) ~ SEX + AGE, data = pheno)
formula <- formula(m)
init <- c(0, coef(m))
# Get the dosages for the current chromosome
for (i in 1:NSNP) {
dose <- get.var.ncdf(genonc, "dose", start = c(i, 1),
count = c(1, NID))
dose <- dose[keep]
f <- coxfitter(dose, pheno, formula, init)
beta[i] <- f[1]
se[i] <- f[2]
n[i] <- f[3]
pz[i] <- f[4]
}
# Write out the results
chrom <- rep(CHR, NSNP)
results <- data.frame(chr = chrom, snp = paste("rs", id,
sep = ""), pos, ref, alt, strand = "+", n, beta = signif(beta,
5), se = signif(se, 5), p = signif(pz, 5))
write.table(results, paste(PHENO, ".coxph.", "chr", CHR,
".results", sep = ""), sep = "\t", quote = FALSE, row.names = FALSE)
}
Note: I used ncdf package 1.8 in R. The CRAN repository version (1.6.x) has had a bug where missing values of byte variables were not coming as NA in R. Because of that I moved to 1.8. The 1.6 has been bumped since I found the bug, so I'm not sure whether the bug still exists in 1.6.5 now. The author of the package recommended I use 1.8. R ncdf 1.8 can be found here.
Have you tried Beagle and PRESTO? http://faculty.washington.edu/browning/beagle/beagle.html
Looked through all the beagle utilities and didn't see much that would help.