GDC API query to map HT-seq FPKM UUID to a Case UUID
2
2
Entering edit mode
6.4 years ago
mk ▴ 290

I have a bunch of FPKM normalized mRNA sequencing from TCGA-PAAD, which covers pancreatic cancer patients. I wrote a script which goes through that data and makes a big data frame with all 60k+ Ensemble mappings for all 180+ patients, so that I can analyze various hypotheses regarding the clinical data.

I have the clinical metadata in a separate table, which I obtained using the TCGABiolinks package.

The problem is that all the columns in the sequencing data table are "HT-seq FPKM UUIDs".

Now all I need to do is map the HT-seq FPKM UUID to a Case UUID (patient)

I need an API query for this, or a call to some TCGABiolinks function.

Thanks in advance for your help!

Ok I have followed the advice here with some modifications. I obtained a json encoded case and file manifest which take the form given below.

File manifest (first few records)

[{
  "file_name": "232f085b-6201-4e4d-8473-e592b8d8e16d.FPKM.txt.gz", 
  "data_format": "TXT", 
  "access": "open", 
  "data_category": "Transcriptome Profiling", 
  "file_size": 514027, 
  "cases": [
    {
      "project": {
        "project_id": "TCGA-PAAD"
      }, 
      "case_id": "620e0648-ec20-4a12-a6cb-5546fe829c77"
    }
  ], 
  "annotations": [
    {
      "annotation_id": "050203a0-12ab-5025-973d-e070d94f722b"
    }
  ]
},{
  "file_name": "b0159d01-f1eb-490d-875b-cfdabed6f529.FPKM.txt.gz", 
  "data_format": "TXT", 
  "access": "open", 
  "data_category": "Transcriptome Profiling", 
  "file_size": 515800, 
  "cases": [
    {
      "project": {
        "project_id": "TCGA-PAAD"
      }, 
      "case_id": "16b38977-aea1-4c75-89ec-4fb551f652dd"
    }
  ]
},{
  "file_name": "f2389819-b8fc-460e-821c-01dba313cce1.FPKM.txt.gz", 
  "data_format": "TXT", 
  "access": "open", 
  "data_category": "Transcriptome Profiling", 
  "file_size": 510184, 
  "cases": [
    {
      "project": {
        "project_id": "TCGA-PAAD"
      }, 
      "case_id": "23908554-b98e-4ff8-98e7-dee3e2c5feaf"
    }
  ]
},{

Cases manifest (first few records)

[{
  "diagnoses": [
    {
      "days_to_death": null
    }
  ], 
  "case_id": "33833131-1482-42d5-9cf5-01cade540234", 
  "submitter_id": "TCGA-2J-AAB4"
},{
  "diagnoses": [
    {
      "days_to_death": 738.0
    }
  ], 
  "case_id": "67e9abc1-4b6f-4054-bdc4-29906c55c682", 
  "submitter_id": "TCGA-3A-A9IC"
},{
  "diagnoses": [
    {
      "days_to_death": null
    }
  ], 
  "case_id": "a53c919a-4e08-46f1-af3f-30b16b597c33", 
  "submitter_id": "TCGA-IB-AAUU"
},{
  "diagnoses": [
    {
      "days_to_death": 278.0
    }
  ], 
  "case_id": "ab449860-46e5-485e-abd5-31c5abef2c58", 
  "submitter_id": "TCGA-L1-A7W4"
},{

Here is the code that was run:

rm(list = ls())
output_logfile <- file("log_output.txt", open="wt")
message_logfile <- file("log_message.txt", open="wt")
sink(file=output_logfile, type = "output")
sink(file=message_logfile, type = "message")
library(GSAR)
library(org.Hs.eg.db)
library(GSVAdata)
library(GSEABase)
library(GSVA)
library(dplyr)
data(c2BroadSets)
#### The following should work with data from the GDC, where we have a shopping cart
#### of zipped single patient sequencing data
#### Example workflow for a GDC-hosted study (TCGA-PAAD):
####    Go to the GDC homepage of the study: https://portal.gdc.cancer.gov/projects/TCGA-PAAD
####    Click on "Cases"
####    download the json manifest for cases
####    move it to the project directory
####    Click on "Files"
####    download the json manifest for files
####    move it to the project directory
####    download the actual data by selecting appropriate filters at left and clicking "add all files to cart"
####    unzip the downloaded archive and move the directory to the project folder


####    This Creates a Matrix with all our data
file.loc <- "PAAD-FPKM"  # the name of the data dir
file.manifest <- "files.json" # the name of the files manifest
cases.manifest <- "cases.json" # the name of the cases manifest
dsep <- "/"
files <- fromJSON(file=file.manifest)
cases <- fromJSON(file=cases.manifest)
dlfiles <- list.files(file.loc, recursive = TRUE)
#### remove the manifest, since we are using a separately downloaded json object
unlink(paste0(file.loc, dsep, "MANIFEST.txt"))
#for (file in 1:length(dlfiles)){
for (file in 1:5){
  if(!exists("rna_table")){
    ## get the patient barcode
    case_id = strsplit(dlfiles[file],"/")
    ## create a table with the expression profile where the count column is named with the patient barcode
    rna_table<-read.delim(paste0(file.loc,dsep,dlfiles[file]), sep="\t", col.names = c("ensemble_id",case_id[[1]][2]))
  } else {
    case_id = strsplit(dlfiles[file],"/")
    new_data<-read.delim(paste0(file.loc,dsep,dlfiles[file]), sep="\t", col.names = c("ensemble_id",case_id[[1]][2]))
    rna_table <- full_join(rna_table, new_data, by = NULL)
  }
}
rna_data <- as.matrix(rna_table)
rownames(rna_data) <- rna_data[,1]
rna_data <- rna_data[,-1]

####    This creates a Matrix of all the death dates
death_data <- matrix(nrow=ncol(rna_data),ncol=2)
colnames(death_data) <- c("filename", "days_to_death")

####    Do a sanity check
list.files(file.loc, recursive = TRUE)[1:10] # here are the files we read to get the read counts
str(death_data[1:5,]) # what our clinical data looks like
str(rna_data[1:5,1:5]) # what our sequencing data looks like

for (case in 1:ncol(rna_data)){
  case_id <- files[[grep(colnames(rna_data)[case],files)]]$cases[[1]]$case_id
  #days_to_death <- cases[grep(case_id, cases)][[1]]$diagnoses[[1]]$days_to_death
  death_data[case,1] <- colnames(rna_data)[case]
  #death_data[case,2] <- days_to_death
  death_data[case,2] <- case_id
}

Now the output (non errors) is fairly straightforward. Notice that I am running this on data from only the first 5 patients in the data folder in the interest of time:

> sink(file=message_logfile, type = "message")

> library(GSAR)

> library(org.Hs.eg.db)

> library(GSVAdata)

> library(GSEABase)

> library(GSVA)

> library(dplyr)

> data(c2BroadSets)

> #### The following should work with data from the GDC, where we have a shopping cart
> #### of zipped single patient sequencing data
> #### Example  .... [TRUNCATED] 

> file.manifest <- "files.json" # the name of the files manifest

> cases.manifest <- "cases.json" # the name of the cases manifest

> dsep <- "/"

> files <- fromJSON(file=file.manifest)

> cases <- fromJSON(file=cases.manifest)

> dlfiles <- list.files(file.loc, recursive = TRUE)

> #### remove the manifest, since we are using a separately downloaded json object
> unlink(paste0(file.loc, dsep, "MANIFEST.txt"))

> #for (file in 1:length(dlfiles)){
> for (file in 1:5){
+   if(!exists("rna_table")){
+     ## get the patient barcode
+     case_id = strsplit(dlfil .... [TRUNCATED] 

> rna_data <- as.matrix(rna_table)

> rownames(rna_data) <- rna_data[,1]

> rna_data <- rna_data[,-1]

> ####    This creates a Matrix of all the death dates
> death_data <- matrix(nrow=ncol(rna_data),ncol=2)

> colnames(death_data) <- c("filename", "days_to_death")

> ####    Do a sanity check
> list.files(file.loc, recursive = TRUE)[1:10] # here are the files we read to get the read counts
 [1] "005c0660-3700-40ea-b037-b456319d369a/bb15d7d0-8705-49af-89e4-fc13c01de642.FPKM.txt.gz"
 [2] "030cf06f-890c-4193-9c7d-254980c73a48/3d771128-9e90-49c2-8ee5-23d994ee6398.FPKM.txt.gz"
 [3] "03a162ee-0be2-484d-ad86-17bba311a3f8/4172e3f8-3578-4f33-9168-6f8c2b8d0783.FPKM.txt.gz"
 [4] "051918c1-9bb2-4146-bf85-4e4a55c5759e/5aed2227-1f31-4159-9eed-430bc45c61dc.FPKM.txt.gz"
 [5] "0882ecec-b533-4912-adc1-8ffd6eaa47c1/c19f102d-47a0-48c6-9443-63730d9ea6d1.FPKM.txt.gz"
 [6] "0ae4ff1f-e2d3-46e0-95a2-0ea80a4ebb63/574df2fc-a608-49c5-8e83-f26d03ef8bb3.FPKM.txt.gz"
 [7] "0c2840a2-3a49-4f22-ae21-1cfbb0034212/fef65b57-c58d-4050-8de4-f09f5cd616ce.FPKM.txt.gz"
 [8] "0dfe7aef-a105-4a32-89ca-49a30a1b59ed/65a45bca-b5d4-4763-a51f-f7b9ad9efcb9.FPKM.txt.gz"
 [9] "0e7871dc-a721-4dae-8938-28a73ec3f968/232f085b-6201-4e4d-8473-e592b8d8e16d.FPKM.txt.gz"
[10] "101e042e-efa2-4c6c-b629-55ecbde859d2/3de80dcb-4ff2-4125-b8e6-9e06ec1cd833.FPKM.txt.gz"

> str(death_data[1:5,]) # what our clinical data looks like
 logi [1:5, 1:2] NA NA NA NA NA NA ...
 - attr(*, "dimnames")=List of 2
  ..$ : NULL
  ..$ : chr [1:2] "filename" "days_to_death"

> str(rna_data[1:5,1:5]) # what our sequencing data looks like
 chr [1:5, 1:5] "3.009793e-03" "2.945653e+00" "0.000000e+00" "3.861741e+00" ...
 - attr(*, "dimnames")=List of 2
  ..$ : chr [1:5] "ENSG00000270112.3" "ENSG00000167578.15" "ENSG00000273842.1" "ENSG00000078237.5" ...
  ..$ : chr [1:5] "bb15d7d0.8705.49af.89e4.fc13c01de642.FPKM.txt.gz" "X3d771128.9e90.49c2.8ee5.23d994ee6398.FPKM.txt.gz" "X4172e3f8.3578.4f33.9168.6f8c2b8d0783.FPKM.txt.gz" "X5aed2227.1f31.4159.9eed.430bc45c61dc.FPKM.txt.gz" ...

> for (case in 1:ncol(rna_data)){
+   case_id <- files[[grep(colnames(rna_data)[case],files)]]$cases[[1]]$case_id
+   #days_to_death <- cases[grep(cas .... [TRUNCATED]

However, Somehow I am only able to locate the first data file in my files manifest. The last loop in the included source causes the following error to be recorded in my logged messages:

Joining, by = "ensemble_id"
Joining, by = "ensemble_id"
Joining, by = "ensemble_id"
Joining, by = "ensemble_id"
Error in files[[grep(colnames(rna_data)[case], files)]] : 
  attempt to select less than one element in get1index

Thanks everybody for your help and looking forward to what you have to say :)

R RNA-Seq TCGA GDC • 5.4k views
ADD COMMENT
5
Entering edit mode
6.4 years ago

Edit: 21st September 2018:

More rapid ways of looking up TCGA barcodes from UUIDs or file-names:

----------------------------------------

Hello,

With the TCGA data, it can indeed be difficult to just figure out which sample is which. A lot of time and effort has to be invested just to organise the data for a particular project. Here's how I did it for a recent TCGA dataset that I re-analysed:

In order to search for the Case UUID from these filenames:

  • download the manifest for your data in JSon format, from here
  • In R, get all of your HTseq FPKM count filenames in a vector or list called filenames (e.g. c("657e19a6-e481-4d06-8613-1a93677f3425.FPKM.txt.gz", "b244f324-fd8a-4d4b-b8f5-bad973c649d5.FPKM.txt.gz", ..., et cetera))
  • Loop through each filename and look up their Case UUID in the JSon file, with a loop like this:

.

require(rjson)
manifest <- fromJSON(file="RNAseqFPKM.json")
caseUUIDs <- c()

for (i in 1:length(filenames))
{
    record <- manifest[[grep(filenames[i], manifest, fixed=TRUE, ignore.case=FALSE)]]
    if (filenames[i]!=record$file_name)
    {
        print("FALSE")
    }
    caseUUIDs[i] <- record$cases[[1]]$case_id
}

I added the inner if statement to print 'FALSE' to screen if any file has no matching record (I never encountered such a situation).

The loop will take a while to run (maybe 5-10 minutes for 150 filenames) - I could possibly develop it further with lapply or mclappy but it's one of those nuisance bit of codings that I always put on the back burner and just tolerate.

Hope that this helps you out!

Kevin

ADD COMMENT
0
Entering edit mode

Hi Kevin, I've used the downloaded manifest strategy above with limited success. Can you have a look?

ADD REPLY
0
Entering edit mode

Hey, I see that you edited your original question with code - thanks!

The problem seems to be 2-fold:

  1. The filenames in the Json file have segments (within the name) separated by hyphens, not dots, e.g., bb15d7d0-8705-49af-89e4-fc13c01de642.FPKM.txt.gz and not bb15d7d0.8705.49af.89e4.fc13c01de642.FPKM.txt.gz. Your filenames have dots, so, they won't match.
  2. Some of your filenames have an 'X' at the beginning, which was added automatically by R if it sees a number at the start of the name

Both of these problems are related to the fact that your filenames were assigned as colnames, which in R means that you cannot have a hyphen in the colname, neither can the colname begin with a number. You'll have to save the filenames earlier / use an earlier listing of the filenames.

Hope that this makes sense?

Kevin

ADD REPLY
0
Entering edit mode

Hi Kevin, Those suggestions solved the problem. I ended up just renaming the columns in the RNA seq matrix generically, then building a lookup table for the file names. Now I have several questions about the analysis of this data and I created a second post to address these. Would you be willing to take a look?

Linking patient survival to gene-set analysis

ADD REPLY
0
Entering edit mode

Hi! Okay, I will take a look but not until later today (Saturday) or Sunday. I will give someone else a chance to take a look too.

ADD REPLY
2
Entering edit mode
6.4 years ago

Hi Matt,

I've in fact just done this but with a different TCGA dataset (SKCM). What I did was download the JSON dump of all the cases in your dataset (which, if I have understood correctly, in your case should be here:

https://portal.gdc.cancer.gov/repository?facetTab=files&filters=%7B%22op%22%3A%22and%22%2C%22content%22%3A%5B%7B%22op%22%3A%22in%22%2C%22content%22%3A%7B%22field%22%3A%22cases.project.project_id%22%2C%22value%22%3A%5B%22TCGA-PAAD%22%5D%7D%7D%2C%7B%22op%22%3A%22in%22%2C%22content%22%3A%7B%22field%22%3A%22files.analysis.workflow_type%22%2C%22value%22%3A%5B%22HTSeq%20-%20FPKM%22%5D%7D%7D%2C%7B%22op%22%3A%22in%22%2C%22content%22%3A%7B%22field%22%3A%22files.data_category%22%2C%22value%22%3A%5B%22Transcriptome%20Profiling%22%5D%7D%7D%5D%7D&searchTableTab=files

And click in JSON (Export all)).

Then, that file gives you the mapping from file ID to case UUID (patient). Object example:

{
  "file_name": "232f085b-6201-4e4d-8473-e592b8d8e16d.FPKM.txt.gz", 
  "data_format": "TXT", 
  "access": "open", 
  "data_category": "Transcriptome Profiling", 
  "file_size": 514027, 
  "cases": [
    {
      "project": {
        "project_id": "TCGA-PAAD"
      }, 
      "case_id": "620e0648-ec20-4a12-a6cb-5546fe829c77"
    }
  ], 
  "annotations": [
    {
      "annotation_id": "050203a0-12ab-5025-973d-e070d94f722b"
    }
  ]
}

I needed the TCGA Barcode so with the case ID I just did a query like this:

curl https://api.gdc.cancer.gov/cases/620e0648-ec20-4a12-a6cb-5546fe829c77?pretty=true

And grep for "submitter_id". ("submitter_id": "TCGA-HZ-7918").

Hope this helps! :)

Daniela

ADD COMMENT
0
Entering edit mode

Hi Daniela, I think I obtained the right manifest and it includes records with relevant parts of the data model, but somehow I am still failing to match my files to patient IDs. I have updated my post above. Can you take a look? Thanks for all your help!

ADD REPLY

Login before adding your answer.

Traffic: 2670 users visited in the last hour
Help About
FAQ
Access RSS
API
Stats

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.

Powered by the version 2.3.6