Question

Download data from Ensembl database

0

Entering edit mode

8.6 years ago

Kumar ▴ 170

Hi, I have python script that able to download genome sequencing data from Ensembl plant database.

So, just I am trying to download Mouse genome data, but this script for only plant, how I change so I can download genome data from mammals also.

following is the script

#########################################################################################

#!/usr/bin/env python


# some examples of files and locations

# pub

# lrwxrwxrwx   1 ftpuser  ftpusers        30 Dec  7 16:33 current_homo_sapiens -> release-36/homo_sapiens_36_35i

#

#/pub/release-36/homo_sapiens_36_35i/data/fasta/dna

#-rw-rw-r--   1 ftpuser  ftpusers  67675771 Nov 15 14:48 Homo_sapiens.NCBI35.dec.dna.chromosome.1.fa.gz

#-rw-rw-r--   1 ftpuser  ftpusers  40802343 Nov 15 14:55 Homo_sapiens.NCBI35.dec.dna_rm.chromosome.1.fa.gz

#

#/pub/release-36/homo_sapiens_36_35i/data/fasta/pep

#-rw-rw-r--   1 ftpuser  ftpusers   3817861 Nov 15 19:46 Homo_sapiens.NCBI35.dec.pep.known.fa.gz

#

#/pub/release-36/homo_sapiens_36_35i/data/mysql/homo_sapiens_core_36_35i

#-rw-rw-r--   1 ftpuser  ftpusers   2957452 Dec  2 22:45 exon.txt.table.gz

#-rw-rw-r--   1 ftpuser  ftpusers   1747738 Dec  2 22:45 exon_stable_id.txt.table.gz

#-rw-rw-r--   1 ftpuser  ftpusers   1489045 Dec  2 22:45 exon_transcript.txt.table.gz

#-rw-rw-r--   1 ftpuser  ftpusers      4626 Dec  2 21:57 homo_sapiens_core_36_35i.mysql40_compatible.sql.gz

#-rw-rw-r--   1 ftpuser  ftpusers      4753 Dec  2 21:57 homo_sapiens_core_36_35i.sql.gz


import os, os.path, re, sys

from ftplib import FTP


class collect:

    def __init__(self): self.data = []

    def more(self, l): self.data.append(l)


def maybeRetrFile(fromPath, toPath):

    what = 'from %s --> to %s' %(fromPath, toPath)

    if os.path.exists(toPath):

        print 'skipping '+what

        return

    else:

        if toPath.endswith('.gz') and os.path.exists(toPath[:-3]):

            print 'skipping (uncompressed) '+what

            return

          

    print what

    toFile = open(toPath, 'w')

    ec.retrbinary('RETR '+fromPath, toFile.write, blocksize=100000)

    toFile.close()


target = sys.argv[1].strip().lower().replace(' ', '_')


release = 'current/'


if len(sys.argv) > 2:

 release = 'release-' + sys.argv[2] + '/'


# set up initial connection

host='ftp.ensemblgenomes.org'

print 'Logging into '+host

ec = FTP(host)

ec.login()


# look for target in a listing of pub

files = collect()

where='pub/'+release+'plants/'+'mysql'

print 'Listing '+where

ec.dir(where, files.more)

tEntries = [l for l in files.data if target+"_core_" in l and '->' not in l ]

if len(tEntries) != 1:

    print target + ' is either missing or not unique:'

    print tEntries

    print '\n'.join(files.data)

    ec.close()

    sys.exit(-1)


# "parse" current link name

curPat = re.compile(r''+target+'_core_(.+)_(.+)\Z')

tPath = tEntries[0].split()[-1]

mo = curPat.match(tPath)

if not mo:

    print 'dont\'t understand release naming scheme: '+ tPath

    ec.close()

    sys.exit(-1)

[maj, min] = mo.groups()

majMin=maj+'_'+min

outDir = target + '_' + majMin

  

print 'Release: '+release[0:len(release)-1]+', '+'tPath: '+tPath+', '+'target: '+target+', '+'maj: '+maj+', '+'majMin: '+majMin+', '+'outDir: '+outDir


## if os.path.exists(outDir):

##     print 'up to date: ' + tPath

##     ec.close()

##     sys.exit(0)


# need to get files. first, set up directories.

[dDir, mDir, pDir] = [outDir+d for d in ['/dna/', '/mysql/', '/pep/']]

if not os.path.exists(dDir): os.makedirs(dDir, 0744)

if not os.path.exists(mDir): os.makedirs(mDir, 0744)

if not os.path.exists(pDir): os.makedirs(pDir, 0744)


# retrieve dna

dnaPat = re.compile(r'\.dna(_rm)?\.chromosome\..+\.fa\.gz\Z')

dFiles = collect()

where = 'pub/'+release+'plants/fasta/%s/dna' % target

print 'Changing dir to '+where

ec.dir(where, dFiles.more)

dKeep = [l for l in dFiles.data if dnaPat.search(l)]

for f in dKeep:

    fn = f.split()[-1]

    maybeRetrFile(where+'/'+fn, dDir+fn)


# retrieve pep

where = 'pub/'+release+'plants/fasta/%s/pep' % target

pFiles = collect()

print 'Changing dir to '+where

ec.dir(where, pFiles.more)

for f in pFiles.data:

    fn = f.split()[-1]

    maybeRetrFile(where+'/'+fn, pDir+fn)


# retrieve mysql

# older releases?: mFiles = ['exon.txt.table', 'exon_transcript.txt.table', 'gene_stable_id.txt.table', 'seq_region.txt.table', 'transcript.txt.table', 'translation.txt.table', 'translation_stable_id.txt.table', target+'_core_'+majMin+'.sql', target+'_core_'+majMin+'.mysql40_compatible.sql']

#older releases which have *_stable_id.txt: mFiles = ['exon.txt', 'exon_transcript.txt', 'gene_stable_id.txt', 'seq_region.txt', 'transcript.txt', 'translation.txt', 'translation_stable_id.txt', target+'_core_'+majMin+'.sql']

mFiles = ['exon.txt', 'exon_transcript.txt', 'seq_region.txt', 'transcript.txt', 'translation.txt', target+'_core_'+majMin+'.sql']


where = 'pub/'+release+'plants/mysql/%s_core_%s' % (target, majMin)

print 'Changing dir to '+where

for mf in mFiles:

    maybeRetrFile(where+'/'+mf+'.gz', mDir+mf+'.gz')


# retrieve GTF

where = 'pub/'+release+'gtf/%s' % (target)

print 'Changing dir to '+where

gtfPat = re.compile(r'\.gtf\.gz\Z')

gFiles = collect()

ec.dir(where, gFiles.more)

gKeep = [l for l in gFiles.data if gtfPat.search(l)]

for f in gKeep:

    fn = f.split()[-1]

    maybeRetrFile(where+'/'+fn, mDir+fn)


ec.close()


print 'Processing Fetched Files'

#os.system('%s/processEnsemblFiles.sh %s' % (sys.path[0], outDir))

##############################################################################################################

genome • 4.3k views

ADD COMMENT • link updated 19 months ago by Ram 43k • written 8.6 years ago by Kumar ▴ 170

0

Entering edit mode

First describe what you exactly need. Genomic fasta files? Annotations as gtf? Proteins as fasta? Because dumping everything is pointless if you are not going to use it.

Second, from a first look at the script, it does not select any subset of species from plant DB. So you have to find a way to select genomes of interests from a list of species. The brain dead but doable way is to get text file containing just interesting for you latin names starting with all species list obtainable i.e. here: ftp://ftp.ensembl.org/pub/release-81/fasta/ (each directory name here).

Once you got it:

canis_familiaris
equus_caballus

It will be trivial to transform such list into wget commands:

wget -A "*.dna.toplevel.fa.gz" ftp://ftp.ensembl.org/pub/release-81/fasta/equus_caballus/dna/
wget -A "*.pep.all.fa.gz"   ftp://ftp.ensembl.org/pub/release-81/fasta/equus_caballus/pep/

ADD REPLY • link updated 19 months ago by Ram 43k • written 8.6 years ago by Darked89 4.6k

0

Entering edit mode

Thanks for reply. I run this script like

python ./fetchEnsemblFilesPlants.py Arabidopsis_thaliana

and it is working perfectly. But about mammals I changed some path of ensembl database, but it is showing error...

This script automatically downloading dna, rm_dna, .pep and exon MySQL files that I need..but it is working only for plant at ensembl database

ADD REPLY • link updated 19 months ago by Ram 43k • written 8.6 years ago by Kumar ▴ 170

0

Entering edit mode

There can be many errors this script can report when trying to fetch files from different ftp site with different structure. BTW, do you set up a local copy of mammalian ENSEMBL DB or something like this? Because I do not see the reasons that at the same time you need all individual chromosome fasta files, whole genome masked and unmasked, plus bunch of mysql tables. Edit the script post as an answer. Please note that it is not a proper answer, just the script you posted with few lines changed. I strongly suggest you debug it using saccharomyces_cerevisiae as target to download and not any mammalian sized entry.

ADD REPLY • link updated 19 months ago by Ram 43k • written 8.6 years ago by Darked89 4.6k

Ram · Accepted Answer · 2015-09-15

#!/usr/bin/env python

"""
Do not use as a solution. 
It is just a step towards getting working script: 
"""

import os, os.path, re, sys

from ftplib import FTP


#modify this if needed: 
host = 'ftp.ensembl.org'
release = 'release-81'  + '/'
#species i.e. equus_caballus as first arg after the script 
target = sys.argv[1].strip().lower().replace(' ', '_')

# set up initial connection

"""
if len(sys.argv) > 2:
    release = 'release-' + sys.argv[2] + '/'
"""

class collect:
    def __init__(self): self.data = []
    def more(self, l): self.data.append(l)

def maybeRetrFile(fromPath, toPath):
    what = 'from %s --> to %s' %(fromPath, toPath)
    if os.path.exists(toPath):
        print 'skipping '+what
        return
    else:
        if toPath.endswith('.gz') and os.path.exists(toPath[:-3]):
            print 'skipping (uncompressed) '+what
            return

    print what
    toFile = open(toPath, 'w')
    ec.retrbinary('RETR '+fromPath, toFile.write, blocksize=100000)
    toFile.close()


print 'Logging into '+host
ec = FTP(host)
ec.login()

# look for target in a listing of pub
files = collect()
where='pub/'+release +'mysql'
print 'Listing '+where
ec.dir(where, files.more)
tEntries = [l for l in files.data if target+"_core_" in l and '->' not in l ]

if len(tEntries) != 1:
    print target + ' is either missing or not unique:'
    print tEntries
    print '\n'.join(files.data)
    ec.close()
    sys.exit(-1)

# "parse" current link name
curPat = re.compile(r''+target+'_core_(.+)_(.+)\Z')
tPath = tEntries[0].split()[-1]
mo = curPat.match(tPath)

if not mo:
    print 'dont\'t understand release naming scheme: '+ tPath
    ec.close()
    sys.exit(-1)

[maj, min] = mo.groups()
majMin=maj+'_'+min
outDir = target + '_' + majMin

print 'Release: '+release[0:len(release)-1]+', '+'tPath: '+tPath+', '+'target: '+target+', '+'maj: '+maj+', '+'majMin: '+majMin+', '+'outDir: '+outDir

## if os.path.exists(outDir):
##     print 'up to date: ' + tPath
##     ec.close()
##     sys.exit(0)

# need to get files. first, set up directories.

[dDir, mDir, pDir] = [outDir+d for d in ['/dna/', '/mysql/', '/pep/']]
if not os.path.exists(dDir): os.makedirs(dDir, 0744)
if not os.path.exists(mDir): os.makedirs(mDir, 0744)
if not os.path.exists(pDir): os.makedirs(pDir, 0744)

# retrieve dna
dnaPat = re.compile(r'\.dna(_rm)?\.chromosome\..+\.fa\.gz\Z')
dFiles = collect()
where = 'pub/'+release+'/fasta/%s/dna' % target

print 'Changing dir to '+where
ec.dir(where, dFiles.more)
dKeep = [l for l in dFiles.data if dnaPat.search(l)]

for f in dKeep:
    fn = f.split()[-1]
    maybeRetrFile(where+'/'+fn, dDir+fn)

# retrieve pep
where = 'pub/'+release+'/fasta/%s/pep' % target
pFiles = collect()

print 'Changing dir to '+where

ec.dir(where, pFiles.more)
for f in pFiles.data:
    fn = f.split()[-1]
    maybeRetrFile(where+'/'+fn, pDir+fn)

# retrieve mysql
# remove anything you do not need from mFiles list below

mFiles = ['exon.txt', 'exon_transcript.txt', 'seq_region.txt', 'transcript.txt', 'translation.txt', target+'_core_'+majMin+'.sql']
where = 'pub/'+release+'/mysql/%s_core_%s' % (target, majMin)
print 'Changing dir to '+where

for mf in mFiles:
    maybeRetrFile(where+'/'+mf+'.gz', mDir+mf+'.gz')

# retrieve GTF
where = 'pub/'+release+'gtf/%s' % (target)
print 'Changing dir to '+where

gtfPat = re.compile(r'\.gtf\.gz\Z')
gFiles = collect()
ec.dir(where, gFiles.more)
gKeep = [l for l in gFiles.data if gtfPat.search(l)]

for f in gKeep:
    fn = f.split()[-1]
    maybeRetrFile(where+'/'+fn, mDir+fn)
ec.close()

print 'Processing Fetched Files'

#os.system('%s/processEnsemblFiles.sh %s' % (sys.path[0], outDir))