class Update

Known bug (2014-04-05)

Error: citations.dmp.trim line 32659: expected 7 columns of data but found 8
This software does not actually use the citations table, so this error can safely be ignored.
It should nevertheless be fixed in a later release.

Public Class Methods

new() click to toggle source
# File lib/ncbi_taxonomy_update.rb, line 9
# Prepares the working directory ($HOME/.ncbi_taxonomy), refreshes the MD5
# checksum of the NCBI taxonomy dump and records in @status whether the
# local database needs to be rebuilt.
#
# After construction @status is one of:
# * true   - no previous checksum existed, or the freshly downloaded
#            checksum differs from the previous one
# * false  - the freshly downloaded checksum equals the previous one
# * String - an error message: the working directory path exists but is
#            not a directory
#
# Side effect: the process working directory is changed to the working
# directory, because download_md5 writes its output to a relative path.
def initialize
        @home_dir = Dir.home
        @work_dir = "#{@home_dir}/.ncbi_taxonomy"
        md5_file = "#{@work_dir}/taxdump.tar.gz.md5"
        md5_old_file = "#{@work_dir}/taxdump.tar.gz.md5.old"
        @taxdb = "#{@work_dir}/taxonomy.db.prep"
        @taxdb_release = "#{@work_dir}/taxonomy.db"
        @status = nil

        # Check the working directory; if it does not exist, create it.
        if File.exist?(@work_dir) && !File.directory?(@work_dir)
                @status = "This software uses $HOME/.ncbi_taxonomy directory. However, in your home directory there is same name of file. We recommend you change that file name to another name."
                return
        end
        Dir.mkdir(@work_dir) unless File.exist?(@work_dir)
        Dir.chdir(@work_dir)

        unless File.exist?(md5_file)
                # No earlier checksum to compare against: always update.
                download_md5
                @status = true
                return
        end

        # Keep the previous checksum aside, fetch the current one and compare.
        # This is done in-process (no `rm`/`diff` subshell) so that paths
        # containing spaces or shell metacharacters are handled correctly.
        File.delete(md5_old_file) if File.exist?(md5_old_file)
        File.rename(md5_file, md5_old_file)
        download_md5

        unchanged = File.exist?(md5_file) &&
                    File.binread(md5_file) == File.binread(md5_old_file)
        @status = !unchanged
end

Public Instance Methods

do() click to toggle source
# File lib/ncbi_taxonomy_update.rb, line 87
# Runs the whole update in order: download the dump, clean the dump
# files, load them into the staging database, then publish that database.
# Returns whatever the final step (release) returns.
def do
        download_dump
        substitution
        load_db
        release
end
download_dump() click to toggle source
# File lib/ncbi_taxonomy_update.rb, line 59
# Downloads the NCBI taxonomy dump archive with curl and pipes it straight
# into tar, which unpacks it into the current working directory.
# curl's stderr is discarded. Returns the captured stdout of the pipeline.
def download_dump
        archive_url = "https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz"
        %x(curl -s #{archive_url} 2>/dev/null | tar zxf - )
end
download_md5() click to toggle source
# File lib/ncbi_taxonomy_update.rb, line 55
# Downloads the MD5 checksum of the NCBI taxonomy dump with curl and writes
# it to taxdump.tar.gz.md5 in the current working directory (the path is
# relative). curl's stderr is discarded.
def download_md5
        checksum_url = "https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz.md5"
        %x(curl -s #{checksum_url} > taxdump.tar.gz.md5 2>/dev/null)
end
load_db() click to toggle source
# File lib/ncbi_taxonomy_update.rb, line 94
        # Builds the staging SQLite database (@taxdb) by running a script
        # through the sqlite3 command-line shell: it creates one table and one
        # index per NCBI dump file and imports the tab-separated *.dmp.trim
        # files.
        #
        # The .import lines use relative file names, so they are resolved
        # against the process working directory.
        #
        # Returns the text sqlite3 wrote to standard output.
        # Raises Errno::ENOENT if the sqlite3 executable cannot be found.
        def load_db
                # A staging file left behind by an interrupted run would make
                # the CREATE statements fail and the imports append duplicate
                # rows, so always start from an empty database.
                File.delete(@taxdb) if File.exist?(@taxdb)

                sql = <<EOF
PRAGMA page_size=4096;
PRAGMA main.locking_mode=EXCLUSIVE;

.separator '\t'

CREATE TABLE citations (
cit_id BIGINT,
cit_key VARCHAR(255),
pubmed_id BIGINT,
medline_id BIGINT,
ur LONGTEXT,
text LONGTEXT,
taxid_list LONGTEXT
);

CREATE TABLE delnodes (
tax_id BIGINT
);

CREATE TABLE division (
division_id     BIGINT,
division_cde VARCHAR(255),
division_name VARCHAR(255),
comments VARCHAR(255)
);

CREATE TABLE gencode (
genetic_code_id INT,
abbreviation VARCHAR(255),
name VARCHAR(255),
cde LONGTEXT,
starts LONGTEXT
);

CREATE TABLE merged (
old_tax_id BIGINT,
new_tax_id BIGINT
);

CREATE TABLE names (
tax_id BIGINT,
name_txt VARCHAR(255),
unique_name VARCHAR(255),
name_class VARCHAR(255)
);

CREATE TABLE nodes (
tax_id BIGINT,
parent_tax_id BIGINT,
rank VARCHAR(64),
embl_code VARCHAR(64),
division_id INTEGER,
inherited_div_flag BOOLEAN,
genetic_code_id INTEGER,
inherited_GC_flag BOOLEAN,
mitochondrial_genetic_code_id INTEGER,
inherited_MGC_flag BOOLEAN,
GenBank_hidden_flag BOOLEAN,
hidden_subtree_root_flag BOOLEAN,
comments VARCHAR(255)
);


CREATE INDEX citations_idx ON citations(cit_id,cit_key,pubmed_id,medline_id,ur,text,taxid_list);
CREATE INDEX delnodes_idx ON delnodes(tax_id);
CREATE INDEX division_idx ON division(division_id,division_cde,division_name,comments);
CREATE INDEX gencode_idx ON gencode(genetic_code_id,abbreviation,name,cde,starts);
CREATE INDEX merged_idx ON merged(old_tax_id,new_tax_id);
CREATE INDEX names_idx ON names(tax_id,name_txt,unique_name,name_class);
CREATE INDEX nodes_idx ON nodes(tax_id,parent_tax_id,rank,embl_code,division_id,inherited_div_flag,genetic_code_id,inherited_GC_flag,mitochondrial_genetic_code_id,inherited_MGC_flag,GenBank_hidden_flag,hidden_subtree_root_flag,comments);


.import citations.dmp.trim citations
.import delnodes.dmp.trim delnodes
.import division.dmp.trim division
.import gencode.dmp.trim gencode
.import merged.dmp.trim merged
.import names.dmp.trim names
.import nodes.dmp.trim nodes

EOF
                # Hand the script to sqlite3 on its standard input without
                # going through a shell: no `echo "..."` quoting pitfalls, and
                # a database path containing spaces is passed as one argument.
                IO.popen(["sqlite3", @taxdb], "r+") do |db|
                        db.write(sql)
                        db.close_write
                        db.read
                end
        end
release() click to toggle source
# File lib/ncbi_taxonomy_update.rb, line 75
# Publishes the staging database: the existing "<release>.old" backup is
# removed, the current release (if any) becomes the new backup, and the
# staging file (@taxdb) is renamed to the release path (@taxdb_release).
# A missing backup or missing current release is not an error.
# Returns the result of the final File.rename.
def release
        backup = "#{@taxdb_release}.old"

        # Drop the backup kept from the previous release, if there is one.
        begin
                FileUtils.rm(backup)
        rescue Errno::ENOENT
                # No earlier backup exists.
        end

        # Demote the current release to backup, if there is one.
        begin
                File.rename(@taxdb_release, backup)
        rescue Errno::ENOENT
                # No current release to back up.
        end

        File.rename(@taxdb, @taxdb_release)
end
status() click to toggle source
# File lib/ncbi_taxonomy_update.rb, line 51
# Returns the value stored in @status by the constructor: true, false, or
# an error message String (nil if it was never set).
def status
        @status
end
substitution() click to toggle source

Substitutes some characters in each .dmp file of the working directory and writes the result to a matching .dmp.trim file.

# File lib/ncbi_taxonomy_update.rb, line 64
# Rewrites every entry of the working directory whose name ends in "dmp"
# into a sibling "<name>.trim" file. The content is treated as ISO-8859-1,
# converted to UTF-8, and then:
# * eight consecutive spaces between two non-"|" characters become one space
# * each "<TAB>|<TAB>" field delimiter becomes a single TAB
# * a trailing "<TAB>|" at the end of a line is removed
# * every double quote is replaced with "%22"
#
# NOTE(review): the eight literal spaces in the first pattern may be a tab
# character that was expanded when this listing was rendered - confirm
# against the real source file before relying on it.
#
# Returns the array of directory entries that was iterated.
def substitution
        Dir.entries(@work_dir).each do |file|
                next unless file =~ /dmp$/

                #STDERR.puts "treating #{file}"
                source_path = @work_dir + "/" + file

                # File.read closes the input handle itself (the previous
                # File.open(...).read left it open until garbage collection),
                # and reading first means a failed read no longer leaves a
                # truncated .trim file behind.
                cleaned = File.read(source_path)
                              .force_encoding('iso-8859-1')
                              .encode('utf-8')
                              .gsub(/([^|])        ([^|])/, '\1 \2')
                              .gsub(/\t\|\t/, "\t")
                              .gsub(/\t\|$/, "")
                              .gsub(/\"/, "%22")

                File.open(source_path + ".trim", "w") { |out| out << cleaned }
        end
end