首页 > 代码库 > 从gff3文件中获取fasta文件
从gff3文件中获取fasta文件
chr1A NRGenome gene 1157233 1158291 . + . ID=TRIAE_CS42_U_TGACv1_641506_AA2096860.1.path1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096860.1chr1A NRGenome mRNA 1157233 1158291 . + . ID=TRIAE_CS42_U_TGACv1_641506_AA2096860.1.mrna1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096860.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096860.1.path1;coverage=100.0;identity=100.0;matches=1059;mismatches=0;indels=0;unknowns=0chr1A NRGenome exon 1157233 1158291 100 + . ID=TRIAE_CS42_U_TGACv1_641506_AA2096860.1.mrna1.exon1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096860.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096860.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096860.1 1 1059 +chr1A NRGenome CDS 1157233 1158291 100 + 0 ID=TRIAE_CS42_U_TGACv1_641506_AA2096860.1.mrna1.cds1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096860.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096860.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096860.1 1 1059 +chr1A NRGenome gene 1159521 1162591 . - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.path1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1chr1A NRGenome mRNA 1159521 1162591 . - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.path1;coverage=100.0;identity=100.0;matches=1527;mismatches=0;indels=0;unknowns=0chr1A NRGenome exon 1162250 1162591 100 - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.exon1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 1 342 +chr1A NRGenome exon 1161953 1162150 100 - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.exon2;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 343 540 +chr1A NRGenome exon 1161682 1161859 100 - . 
ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.exon3;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 541 718 +chr1A NRGenome exon 1161377 1161547 100 - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.exon4;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 719 889 +chr1A NRGenome exon 1160679 1160710 100 - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.exon5;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 890 921 +chr1A NRGenome exon 1160535 1160577 100 - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.exon6;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 922 964 +chr1A NRGenome exon 1160392 1160459 100 - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.exon7;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 965 1032 +chr1A NRGenome exon 1160086 1160127 100 - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.exon8;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 1033 1074 +chr1A NRGenome exon 1159521 1159973 100 - . 
ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.exon9;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 1075 1527 +chr1A NRGenome CDS 1162250 1162591 100 - 0 ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.cds1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 1 342 +chr1A NRGenome CDS 1161953 1162150 100 - 0 ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.cds2;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 343 540 +chr1A NRGenome CDS 1161682 1161859 100 - 0 ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.cds3;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 541 718 +chr1A NRGenome CDS 1161377 1161547 100 - 1 ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.cds4;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 719 889 +chr1A NRGenome CDS 1160679 1160710 100 - 1 ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.cds5;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 890 921 +chr1A NRGenome CDS 1160535 1160577 100 - 0 ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.cds6;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 922 964 +chr1A NRGenome CDS 1160392 1160459 100 - 1 ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.cds7;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 965 1032 +chr1A NRGenome CDS 1160086 1160127 100 - 0 
ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.cds8;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 1033 1074 +chr1A NRGenome CDS 1159521 1159973 100 - 0 ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.cds9;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 1075 1527 +chr1A NRGenome gene 1159521 1162591 . - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.path1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3chr1A NRGenome mRNA 1159521 1162591 . - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.path1;coverage=100.0;identity=100.0;matches=1434;mismatches=0;indels=0;unknowns=0chr1A NRGenome exon 1162546 1162591 100 - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.exon1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 1 46 +chr1A NRGenome exon 1162250 1162452 100 - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.exon2;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 47 249 +chr1A NRGenome exon 1161953 1162150 100 - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.exon3;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 250 447 +chr1A NRGenome exon 1161682 1161859 100 - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.exon4;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 448 625 +chr1A NRGenome exon 1161377 1161547 100 - . 
ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.exon5;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 626 796 +chr1A NRGenome exon 1160679 1160710 100 - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.exon6;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 797 828 +chr1A NRGenome exon 1160535 1160577 100 - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.exon7;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 829 871 +chr1A NRGenome exon 1160392 1160459 100 - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.exon8;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 872 939 +chr1A NRGenome exon 1160086 1160127 100 - . ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.exon9;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 940 981 +chr1A NRGenome exon 1159521 1159973 100 - . 
ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.exon10;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 982 1434 +chr1A NRGenome CDS 1162546 1162591 100 - 0 ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.cds1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 1 46 +chr1A NRGenome CDS 1162250 1162452 100 - 1 ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.cds2;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 47 249 +chr1A NRGenome CDS 1161953 1162150 100 - 0 ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.cds3;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 250 447 +chr1A NRGenome CDS 1161682 1161859 100 - 0 ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.cds4;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 448 625 +chr1A NRGenome CDS 1161377 1161547 100 - 1 ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.cds5;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 626 796 +chr1A NRGenome CDS 1160679 1160710 100 - 1 ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.cds6;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 797 828 +chr1A NRGenome CDS 1160535 1160577 100 - 0 ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.cds7;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 829 871 +chr1A NRGenome CDS 1160392 1160459 100 - 1 
ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.cds8;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 872 939 +chr1A NRGenome CDS 1160086 1160127 100 - 0 ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.cds9;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 940 981 +chr1A NRGenome CDS 1159521 1159973 100 - 0 ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.cds10;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 982 1434 +
Python 代码
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Extract FASTA sequences for the features described in a GFF3 file.

Reads gene, mRNA, exon and CDS features from ``GFF3_FILE`` and writes five
FASTA files into the current directory:

* ``gene.fasta``  - genomic span of every gene (introns included)
* ``mRNA.fasta``  - genomic span of every mRNA (introns included)
* ``exon.fasta``  - exons of each name joined in transcript order
* ``CDS.fasta``   - CDS segments of each name joined in transcript order
* ``pro_and_downstream.fasta`` - gene plus ``UPSTREAM_BP`` of upstream and
  ``DOWNSTREAM_BP`` of downstream sequence

Minus-strand features are reverse-complemented so every record reads 5'->3'.
Records are keyed by the ``Name`` attribute; features sharing a ``Name`` are
treated as belonging to the same gene/transcript.
"""

GENOME_FASTA = "/data2/masw_data/seqdb/chr1A.fasta"
GFF3_FILE = "1.txt"
UPSTREAM_BP = 2000
DOWNSTREAM_BP = 1000

# Feature types that may occur several times per name and must be spliced.
_MULTI_PART = ("exon", "CDS")


def _feature_name(attributes):
    """Return the ``Name`` attribute of a GFF3 column 9 string, else its ``ID``.

    Returns ``None`` when neither attribute is present.
    """
    found = {}
    for item in attributes.split(";"):
        key, sep, value = item.strip().partition("=")
        if sep and key in ("Name", "ID") and key not in found:
            found[key] = value
    return found.get("Name") or found.get("ID")


def parse_gff3(lines):
    """Collect gene/mRNA/exon/CDS locations from an iterable of GFF3 lines.

    Blank lines, ``#`` comments/directives, lines with fewer than nine
    columns, features of other types, features without a ``+``/``-`` strand
    and features without a ``Name``/``ID`` are skipped.  Parsing stops at a
    ``##FASTA`` directive.  Columns are split on tabs; lines containing no tab
    at all (e.g. pasted text) fall back to whitespace splitting.

    Returns:
        dict with keys ``"gene"``, ``"mRNA"``, ``"exon"`` and ``"CDS"``.
        ``gene``/``mRNA`` map name -> ``(chrom, start, end, strand)``;
        ``exon``/``CDS`` map name -> list of such tuples in file order.
        ``start``/``end`` are the 1-based inclusive integers from the file.

    Raises:
        ValueError: if a start or end column is not an integer.
    """
    features = {"gene": {}, "mRNA": {}, "exon": {}, "CDS": {}}
    for raw in lines:
        line = raw.strip()
        if line.startswith("##FASTA"):
            break
        if not line or line.startswith("#"):
            continue
        # maxsplit=8 keeps spaces inside the attribute column (e.g. Target=).
        fields = line.split("\t") if "\t" in line else line.split(None, 8)
        if len(fields) < 9:
            continue
        chrom, kind, strand = fields[0], fields[2], fields[6]
        if kind not in features or strand not in ("+", "-"):
            continue
        name = _feature_name(fields[8])
        if not name:
            continue
        location = (chrom, int(fields[3]), int(fields[4]), strand)
        if kind in _MULTI_PART:
            features[kind].setdefault(name, []).append(location)
        else:
            features[kind][name] = location
    return features


def slice_bounds(start, end, strand, upstream=0, downstream=0):
    """Convert a 1-based inclusive feature to a 0-based half-open slice.

    ``upstream``/``downstream`` are flank lengths relative to the feature's
    own strand, so on the minus strand the upstream flank extends past
    ``end`` and the downstream flank precedes ``start``.  The lower bound is
    clamped at 0 so that a flank running off the start of the sequence does
    not turn into a negative (wrap-around) index.
    """
    if strand == "-":
        low, high = start - 1 - downstream, end + upstream
    else:
        low, high = start - 1 - upstream, end + downstream
    return max(low, 0), high


def transcript_order(parts):
    """Return feature tuples sorted 5'->3' along the transcript.

    Plus-strand parts are ordered by ascending start, minus-strand parts by
    descending start; the strand of the first part decides.
    """
    if not parts:
        return []
    return sorted(parts, key=lambda part: part[1],
                  reverse=(parts[0][3] == "-"))


class _SequenceFetcher(object):
    """Fetch strand-aware subsequences from an indexed FASTA.

    ``SeqIO.index`` parses a record from disk on every lookup, so the most
    recently used chromosome sequence is kept to avoid re-reading it for
    each feature.
    """

    def __init__(self, records):
        self._records = records
        self._chrom = None
        self._seq = None

    def fetch(self, chrom, start, end, strand, upstream=0, downstream=0):
        """Return the (optionally flanked) feature sequence, 5'->3'."""
        if chrom != self._chrom:
            self._seq = self._records[chrom].seq
            self._chrom = chrom
        low, high = slice_bounds(start, end, strand, upstream, downstream)
        piece = self._seq[low:high]
        return piece.reverse_complement() if strand == "-" else piece


def _write_spans(path, table, fetcher, upstream=0, downstream=0):
    """Write one FASTA record per single-location feature in ``table``."""
    with open(path, "w") as out:
        for name, (chrom, start, end, strand) in table.items():
            seq = fetcher.fetch(chrom, start, end, strand,
                                upstream, downstream)
            out.write(">%s\n%s\n" % (name, seq))


def _write_spliced(path, table, fetcher):
    """Write one FASTA record per name, joining its parts 5'->3'."""
    with open(path, "w") as out:
        for name, parts in table.items():
            pieces = [str(fetcher.fetch(*part))
                      for part in transcript_order(parts)]
            out.write(">%s\n%s\n" % (name, "".join(pieces)))


def main():
    """Parse ``GFF3_FILE`` and write all five FASTA outputs."""
    # Imported here so the pure parsing helpers stay importable without
    # Biopython installed.
    from Bio import SeqIO

    with open(GFF3_FILE, "r") as handle:
        features = parse_gff3(handle)

    fetcher = _SequenceFetcher(SeqIO.index(GENOME_FASTA, "fasta"))

    # Gene and mRNA spans, introns included.
    _write_spans("gene.fasta", features["gene"], fetcher)
    _write_spans("mRNA.fasta", features["mRNA"], fetcher)
    # 2 kb upstream + gene + 1 kb downstream.
    _write_spans("pro_and_downstream.fasta", features["gene"], fetcher,
                 upstream=UPSTREAM_BP, downstream=DOWNSTREAM_BP)
    # Spliced CDS and exon sequences.
    _write_spliced("CDS.fasta", features["CDS"], fetcher)
    _write_spliced("exon.fasta", features["exon"], fetcher)


if __name__ == "__main__":
    main()
从gff3文件中获取fasta文件
声明:以上内容来自用户投稿及互联网公开渠道收集整理发布,本网站不拥有所有权,未作人工编辑处理,也不承担相关法律责任,若内容有误或涉及侵权可进行投诉: 投诉/举报 工作人员会在5个工作日内联系你,一经查实,本站将立刻删除涉嫌侵权内容。