首页 > 代码库 > 从gff3文件中获取fasta文件

从gff3文件中获取fasta文件

chr1A	NRGenome	gene	1157233	1158291	.	+	.	ID=TRIAE_CS42_U_TGACv1_641506_AA2096860.1.path1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096860.1chr1A	NRGenome	mRNA	1157233	1158291	.	+	.	ID=TRIAE_CS42_U_TGACv1_641506_AA2096860.1.mrna1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096860.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096860.1.path1;coverage=100.0;identity=100.0;matches=1059;mismatches=0;indels=0;unknowns=0chr1A	NRGenome	exon	1157233	1158291	100	+	.	ID=TRIAE_CS42_U_TGACv1_641506_AA2096860.1.mrna1.exon1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096860.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096860.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096860.1 1 1059 +chr1A	NRGenome	CDS	1157233	1158291	100	+	0	ID=TRIAE_CS42_U_TGACv1_641506_AA2096860.1.mrna1.cds1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096860.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096860.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096860.1 1 1059 +chr1A	NRGenome	gene	1159521	1162591	.	-	.	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.path1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1chr1A	NRGenome	mRNA	1159521	1162591	.	-	.	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.path1;coverage=100.0;identity=100.0;matches=1527;mismatches=0;indels=0;unknowns=0chr1A	NRGenome	exon	1162250	1162591	100	-	.	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.exon1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 1 342 +chr1A	NRGenome	exon	1161953	1162150	100	-	.	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.exon2;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 343 540 +chr1A	NRGenome	exon	1161682	1161859	100	-	.	
ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.exon3;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 541 718 +chr1A	NRGenome	exon	1161377	1161547	100	-	.	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.exon4;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 719 889 +chr1A	NRGenome	exon	1160679	1160710	100	-	.	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.exon5;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 890 921 +chr1A	NRGenome	exon	1160535	1160577	100	-	.	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.exon6;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 922 964 +chr1A	NRGenome	exon	1160392	1160459	100	-	.	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.exon7;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 965 1032 +chr1A	NRGenome	exon	1160086	1160127	100	-	.	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.exon8;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 1033 1074 +chr1A	NRGenome	exon	1159521	1159973	100	-	.	
ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.exon9;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 1075 1527 +chr1A	NRGenome	CDS	1162250	1162591	100	-	0	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.cds1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 1 342 +chr1A	NRGenome	CDS	1161953	1162150	100	-	0	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.cds2;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 343 540 +chr1A	NRGenome	CDS	1161682	1161859	100	-	0	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.cds3;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 541 718 +chr1A	NRGenome	CDS	1161377	1161547	100	-	1	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.cds4;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 719 889 +chr1A	NRGenome	CDS	1160679	1160710	100	-	1	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.cds5;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 890 921 +chr1A	NRGenome	CDS	1160535	1160577	100	-	0	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.cds6;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 922 964 +chr1A	NRGenome	CDS	1160392	1160459	100	-	1	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.cds7;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 965 1032 +chr1A	NRGenome	CDS	1160086	1160127	100	-	0	
ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.cds8;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 1033 1074 +chr1A	NRGenome	CDS	1159521	1159973	100	-	0	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1.cds9;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.1;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.1.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.1 1075 1527 +chr1A	NRGenome	gene	1159521	1162591	.	-	.	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.path1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3chr1A	NRGenome	mRNA	1159521	1162591	.	-	.	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.path1;coverage=100.0;identity=100.0;matches=1434;mismatches=0;indels=0;unknowns=0chr1A	NRGenome	exon	1162546	1162591	100	-	.	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.exon1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 1 46 +chr1A	NRGenome	exon	1162250	1162452	100	-	.	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.exon2;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 47 249 +chr1A	NRGenome	exon	1161953	1162150	100	-	.	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.exon3;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 250 447 +chr1A	NRGenome	exon	1161682	1161859	100	-	.	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.exon4;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 448 625 +chr1A	NRGenome	exon	1161377	1161547	100	-	.	
ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.exon5;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 626 796 +chr1A	NRGenome	exon	1160679	1160710	100	-	.	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.exon6;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 797 828 +chr1A	NRGenome	exon	1160535	1160577	100	-	.	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.exon7;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 829 871 +chr1A	NRGenome	exon	1160392	1160459	100	-	.	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.exon8;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 872 939 +chr1A	NRGenome	exon	1160086	1160127	100	-	.	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.exon9;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 940 981 +chr1A	NRGenome	exon	1159521	1159973	100	-	.	
ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.exon10;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 982 1434 +chr1A	NRGenome	CDS	1162546	1162591	100	-	0	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.cds1;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 1 46 +chr1A	NRGenome	CDS	1162250	1162452	100	-	1	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.cds2;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 47 249 +chr1A	NRGenome	CDS	1161953	1162150	100	-	0	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.cds3;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 250 447 +chr1A	NRGenome	CDS	1161682	1161859	100	-	0	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.cds4;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 448 625 +chr1A	NRGenome	CDS	1161377	1161547	100	-	1	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.cds5;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 626 796 +chr1A	NRGenome	CDS	1160679	1160710	100	-	1	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.cds6;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 797 828 +chr1A	NRGenome	CDS	1160535	1160577	100	-	0	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.cds7;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 829 871 +chr1A	NRGenome	CDS	1160392	1160459	100	-	1	
ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.cds8;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 872 939 +chr1A	NRGenome	CDS	1160086	1160127	100	-	0	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.cds9;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 940 981 +chr1A	NRGenome	CDS	1159521	1159973	100	-	0	ID=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1.cds10;Name=TRIAE_CS42_U_TGACv1_641506_AA2096870.3;Parent=TRIAE_CS42_U_TGACv1_641506_AA2096870.3.mrna1;Target=TRIAE_CS42_U_TGACv1_641506_AA2096870.3 982 1434 +

python代码

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Extract FASTA sequences for the features described in a GFF3 file.

Reads gene / mRNA / exon / CDS records from ``GFF3_FILE`` and pulls the
matching sequence out of the indexed genome ``GENOME_FASTA``.  Five files
are written to the current directory:

* ``gene.fasta``  - genomic span of every gene (introns included)
* ``mRNA.fasta``  - genomic span of every mRNA (introns included)
* ``exon.fasta``  - exons of each transcript joined in transcript order
* ``CDS.fasta``   - CDS segments of each transcript joined in transcript order
* ``pro_and_downstream.fasta`` - gene plus 2 kb upstream and 1 kb downstream

Records are keyed by the GFF3 ``Name`` attribute; minus-strand features are
reverse complemented.
"""
from collections import defaultdict

GENOME_FASTA = "/data2/masw_data/seqdb/chr1A.fasta"
GFF3_FILE = "1.txt"
UPSTREAM = 2000    # promoter bases added 5' of the gene
DOWNSTREAM = 1000  # bases added 3' of the gene


def parse_gff3_line(line):
    """Parse one GFF3 record.

    Args:
        line: a raw line from the GFF3 file.

    Returns:
        ``(chrom, feature, start, end, strand, name)`` with 1-based inclusive
        integer coordinates, or ``None`` for blank lines, ``#`` comment lines
        and lines with fewer than nine columns.

    Raises:
        ValueError: if the record has neither a ``Name`` nor an ``ID``
            attribute, or its coordinates are not integers.
    """
    stripped = line.strip()
    if not stripped or stripped.startswith("#"):
        return None
    # At most nine fields: column 9 may itself contain spaces
    # (e.g. "Target=X 1 1059 +"), so it must not be split further.
    fields = stripped.split(None, 8)
    if len(fields) < 9:
        return None
    attributes = {}
    for item in fields[8].split(";"):
        key, sep, value = item.partition("=")
        if sep:
            attributes[key.strip()] = value.strip()
    # Look the name up by key instead of relying on attribute position.
    name = attributes.get("Name") or attributes.get("ID")
    if not name:
        raise ValueError("no Name/ID attribute in GFF3 line: %r" % line)
    return fields[0], fields[2], int(fields[3]), int(fields[4]), fields[6], name


def flank_bounds(start, end, strand, upstream=UPSTREAM, downstream=DOWNSTREAM):
    """Return the 0-based half-open slice covering a gene and its flanks.

    Args:
        start: 1-based inclusive gene start.
        end: 1-based inclusive gene end.
        strand: ``"-"`` for the minus strand; anything else is treated as plus.
        upstream: number of bases to add 5' of the gene.
        downstream: number of bases to add 3' of the gene.

    Returns:
        ``(begin, stop)`` suitable for ``sequence[begin:stop]``.  ``begin`` is
        clamped to 0 so a gene near the chromosome start cannot produce a
        negative index (which Python would interpret as counting from the
        end).  ``stop`` may exceed the chromosome length; slicing truncates it.
    """
    if strand == "-":
        # On the minus strand the 5' side lies at higher coordinates.
        before, after = downstream, upstream
    else:
        before, after = upstream, downstream
    return max(0, start - 1 - before), end + after


def transcript_order(segments):
    """Return segments sorted 5'->3' along their transcript.

    Args:
        segments: list of ``(chrom, start, end, strand)`` tuples belonging to
            one transcript.

    Returns:
        A new list sorted by ascending start for plus-strand transcripts and
        by descending start for minus-strand ones (strand taken from the
        first segment), so the result does not depend on file order.
    """
    if not segments:
        return []
    descending = segments[0][3] == "-"
    return sorted(segments, key=lambda seg: seg[1], reverse=descending)


def load_features(path):
    """Read a GFF3 file into per-feature-type lookups keyed by name.

    Args:
        path: path of the GFF3 file.

    Returns:
        ``(genes, mrnas, exons, cds)``.  ``genes`` and ``mrnas`` map a name to
        one ``(chrom, start, end, strand)`` tuple; ``exons`` and ``cds`` map a
        name to a list of such tuples.
    """
    genes, mrnas = {}, {}
    exons, cds = defaultdict(list), defaultdict(list)
    with open(path) as handle:
        for line in handle:
            parsed = parse_gff3_line(line)
            if parsed is None:
                continue
            chrom, feature, start, end, strand, name = parsed
            location = (chrom, start, end, strand)
            if feature == "gene":
                genes[name] = location
            elif feature == "mRNA":
                mrnas[name] = location
            elif feature == "exon":
                exons[name].append(location)
            elif feature == "CDS":
                cds[name].append(location)
    return genes, mrnas, exons, cds


def fetch(records, chrom, begin, stop, strand):
    """Return ``records[chrom][begin:stop]`` as a string on the given strand.

    Args:
        records: mapping of sequence id to a Biopython ``SeqRecord``.
        chrom: sequence id to look up.
        begin: 0-based slice start.
        stop: 0-based exclusive slice end.
        strand: ``"+"`` or ``"-"``.

    Returns:
        The sequence string (reverse complemented for ``"-"``), or ``None``
        when the strand is neither ``"+"`` nor ``"-"``.
    """
    if strand not in ("+", "-"):
        return None
    sequence = records[chrom][begin:stop].seq
    if strand == "-":
        sequence = sequence.reverse_complement()
    return str(sequence)


def spliced_sequence(records, segments):
    """Join the segments of one transcript into a single 5'->3' string."""
    pieces = [
        fetch(records, chrom, start - 1, end, strand)
        for chrom, start, end, strand in transcript_order(segments)
    ]
    return "".join(piece for piece in pieces if piece is not None)


def write_fasta(handle, name, sequence):
    """Write one FASTA record; records with no sequence are skipped."""
    if sequence:
        handle.write(">%s\n%s\n" % (name, sequence))


def main():
    """Run the extraction with the module-level input paths."""
    # Imported here so the pure helpers above stay importable (and testable)
    # on machines without Biopython.
    from Bio import SeqIO

    records = SeqIO.index(GENOME_FASTA, "fasta")
    genes, mrnas, exons, cds = load_features(GFF3_FILE)

    with open("gene.fasta", "w") as gene_out, \
            open("mRNA.fasta", "w") as mrna_out, \
            open("exon.fasta", "w") as exon_out, \
            open("CDS.fasta", "w") as cds_out, \
            open("pro_and_downstream.fasta", "w") as flank_out:
        # Gene sequence, introns included.
        for name, (chrom, start, end, strand) in genes.items():
            write_fasta(gene_out, name,
                        fetch(records, chrom, start - 1, end, strand))

        # mRNA genomic span, introns included.
        for name, (chrom, start, end, strand) in mrnas.items():
            write_fasta(mrna_out, name,
                        fetch(records, chrom, start - 1, end, strand))

        # Gene plus 2 kb upstream and 1 kb downstream.
        for name, (chrom, start, end, strand) in genes.items():
            begin, stop = flank_bounds(start, end, strand)
            write_fasta(flank_out, name,
                        fetch(records, chrom, begin, stop, strand))

        # Spliced CDS sequence, one record per transcript.
        for name, segments in cds.items():
            write_fasta(cds_out, name, spliced_sequence(records, segments))

        # Spliced exon sequence, one record per transcript.
        for name, segments in exons.items():
            write_fasta(exon_out, name, spliced_sequence(records, segments))


if __name__ == "__main__":
    main()

  

从gff3文件中获取fasta文件