vcf 模块

首页 > 代码库 > vcf 模块

2024-07-18 06:16:36 220人阅读

最近一直在处理samtools freebayes gatk 产生的snp数据，结果文件都是vcf，于是自己就写了相应的类，但是总是不够完善。海宝推荐这个模块，他都推荐了我还抱着我那烂代码不放干啥之前写的就当练习类了

安装：

sudo pip install pyvcf

然后报错说没有counter模块，于是：

sudo pip install counter

然后就安装好了

简单实用：

import vcf
myvcf = vcf.Reader(open(‘testpyvcf‘, ‘r‘)) #和python内置的文件类型一样，循环完不会从头开始。
for i in myvcf:
print i

Record(CHROM=Chr1, POS=11553, REF=G, ALT=[C])
Record(CHROM=Chr1, POS=12840, REF=T, ALT=[G])
Record(CHROM=Chr1, POS=16188, REF=GAAAAAAAG, ALT=[GAAAAAAAAG])
Record(CHROM=Chr1, POS=18915, REF=CAAAAAAG, ALT=[CAAAAAAAG])
Record(CHROM=Chr1, POS=19439, REF=CTTTTTTTTTA, ALT=[CTTTTTTTTTTTA])
Record(CHROM=Chr1, POS=24810, REF=ATTTTTTTTTC, ALT=[ATTTTTTTTTTC])
Record(CHROM=Chr1, POS=26067, REF=CAAAAAAG, ALT=[CAAAAAAAG])
Record(CHROM=Chr1, POS=26996, REF=CAAAAAAAAT, ALT=[CAAAAAAAAAAT])
Record(CHROM=Chr1, POS=27142, REF=C, ALT=[G])
Record(CHROM=Chr1, POS=27698, REF=CTTTTTTTTC, ALT=[CTTTTTTTTTC])
Record(CHROM=Chr1, POS=30645, REF=A, ALT=[C])
Record(CHROM=Chr1, POS=31478, REF=C, ALT=[T])
Record(CHROM=Chr1, POS=33667, REF=A, ALT=[G])
Record(CHROM=Chr1, POS=34057, REF=C, ALT=[T])
Record(CHROM=Chr1, POS=34339, REF=TAAAAAAAAAC, ALT=[TAAAAAAAAAAC])
Record(CHROM=Chr1, POS=35604, REF=T, ALT=[G])

当然可以直接取你所需

print i.CHROM #对应vcf chr那一列

print i.POS #对应vcf pos那一列返回的是整型

print i.ID #对应ID 无得话返回None

print i.REF #对应ref列返回的是一个字符串

print i.ALT #对应alt列注意返回的是一个列表

print i.QUAL #对应qual列返回的是float

print i.FILTER #对应filter 列无的话返回None

print i.INFO #对应vcf文件INFO列返回的是一个字典注意字典的值有的是列表，有的是字符串，有的是int, 有的是float

eg:

{‘SAP‘: [5.1817700000000002], ‘EPP‘: [5.1817700000000002], ‘SRR‘: 5, ‘DPB‘: 25.0, ‘MQMR‘: 50.0, ‘DP‘: 25, ‘PAO‘: [0.0], ‘RPP‘: [11.696199999999999], ‘PAIRED‘: [0.75], ‘ODDS‘: 3.4715400000000001, ‘MEANALT‘: [1.0], ‘MQM‘: [50.0], ‘SAF‘: [3], ‘PAIREDR‘: 0.90476199999999996, ‘EPPR‘: 11.385999999999999, ‘SAR‘: [1], ‘NS‘: 3, ‘RO‘: 21, ‘AC‘: [2], ‘AB‘: [0.20000000000000001], ‘SRF‘: 16, ‘AF‘: [0.33333299999999999], ‘GTI‘: 0, ‘AO‘: [4], ‘AN‘: 6, ‘ABP‘: [18.6449], ‘SRP‘: 15.5221, ‘DPRA‘: [2.0], ‘RPPR‘: 15.5221, ‘PQA‘: [0.0], ‘QR‘: 656, ‘RUN‘: [1], ‘CIGAR‘: [‘1X‘], ‘LEN‘: [1], ‘NUMALT‘: 1, ‘QA‘: [126], ‘PQR‘: 0.0, ‘TYPE‘: [‘snp‘], ‘PRO‘: 0.0}

可以通过相应的键取出对应的值如：

print i.INFO[‘TYPE’] 返回列表

print i.INFO[‘DP’] 返回的是int

i.FORMAT #返回format列字符串如果你的vcf文件中没有FORMAT 返回None

eg:

print i.FORMAT, type(i.FORMAT)

GT:DP:RO:QR:AO:QA:GL <type ‘str‘>
GT:DP:RO:QR:AO:QA:GL <type ‘str‘>
GT:DP:RO:QR:AO:QA:GL <type ‘str‘>
GT:DP:RO:QR:AO:QA:GL <type ‘str‘>
GT:DP:RO:QR:AO:QA:GL <type ‘str‘>
GT:DP:RO:QR:AO:QA:GL <type ‘str‘>
GT:DP:RO:QR:AO:QA:GL <type ‘str‘>
GT:DP:RO:QR:AO:QA:GL <type ‘str‘>
GT:DP:RO:QR:AO:QA:GL <type ‘str‘>
GT:DP:RO:QR:AO:QA:GL <type ‘str‘>
GT:DP:RO:QR:AO:QA:GL <type ‘str‘>
GT:DP:RO:QR:AO:QA:GL <type ‘str‘>
GT:DP:RO:QR:AO:QA:GL <type ‘str‘>
GT:DP:RO:QR:AO:QA:GL <type ‘str‘>
GT:DP:RO:QR:AO:QA:GL <type ‘str‘>
GT:DP:RO:QR:AO:QA:GL <type ‘str‘>

i.samples 和 i.genotype #其他都是大写这两个是小写

这两个不对应哪一列

eg:

print i.samples # 我的vcf文件有三个样品分别是12 CS48 F12 返回的是三个样 call object 组成的列表。

[Call(sample=12, CallData(GT=0/1, DP=12, RO=10, QR=300, AO=2, QA=64, GL=[-4.2871800000000002, 0.0, -10.0])), Call(sample=CS48, CallData(GT=0/1, DP=8, RO=6, QR=199, AO=2, QA=62, GL=[-4.9289199999999997, 0.0, -10.0])), Call(sample=F12, CallData(GT=0/0, DP=5, RO=5, QR=157, AO=0, QA=0, GL=[0.0, -1.50515, -10.0]))]

如何取出每个样中心信息？

for i in myvcf: #返回record

for j in i.samples: #返回每个sample

print j[‘GT’] #每个sample的genotype 当然也可以j[‘DP’], j[‘RO’], j[‘AO’]…

1/1   #第一个样本的genotype   GT返回字符串 AO RO DP 返回int
0/1    #第二个样本的genotype
0/1    #第三个样本的genotype

如果某个样没有信息返回None

i.genotype的用法：

eg:

for i in myvcf:
print i.genotype(‘12‘)[‘GT‘] #和i.samples不太一样的是你必须知道你要取哪个样的信息，因为你必须给genotype传一个样本参数。

i.genotype返回的是call对象

每个call对象有三个属性。 site, sample, data

eg:

for i in myvcf:
    call = i.genotype(‘12‘) #返回call对象
    print call.site         #返回call对象的chrom pos refbase altbase 信息
    print call.sample      #返回样本名字
    print call.data       #返回call data

Record(CHROM=Chr1, POS=35604, REF=T, ALT=[G]) #可以讲对应项取出来
12 #样本的名字
CallData(GT=0/1, DP=12, RO=10, QR=300, AO=2, QA=64, GL=[-4.2871800000000002, 0.0, -10.0]) #可以将对应项取出来

除以上的方法外还有些方便的方法直接使用，如

i.is_snp

i.is_indel

i.is_transition

i.is_deletion

i.is_monomorphic

上面的几位都返回bool

i.var_type #如snp,,,,

i.var_subtype #如ts

>>> myvcf = vcf.Reader(open(‘testpyvcf‘, ‘r‘))
>>> myvcf.metadata
OrderedDict([(‘fileformat‘, ‘VCFv4.1‘), (‘fileDate‘, ‘20140817‘), (‘source‘, [‘freeBayes v0.9.14-15-gc6f49c0‘]), (‘reference‘, ‘../../call_snp/refseq/Osativa_204.fa‘), (‘phasing‘, [‘none‘]), (‘commandline‘, [‘"freebayes -f ../../call_snp/refseq/Osativa_204.fa -u -X -L bamfile.fb.list"‘])])

>>> myvcf.samples
[‘12‘, ‘CS48‘, ‘F12‘]

还有myvcf.infos myvcf.filters myvcf.formats

>>> myvcf.infos
OrderedDict([(‘NS‘, Info(id=‘NS‘, num=1, type=‘Integer‘, desc=‘Number of samples with data‘)), (‘DP‘, Info(id=‘DP‘, num=1, type=‘Integer‘, desc=‘Total read depth at the locus‘)), (‘DPB‘, Info(id=‘DPB‘, num=1, type=‘Float‘, desc=‘Total read depth per bp at the locus; bases in reads overlapping / bases in haplotype‘)), (‘AC‘, Info(id=‘AC‘, num=-1, type=‘Integer‘, desc=‘Total number of alternate alleles in called genotypes‘)), (‘AN‘, Info(id=‘AN‘, num=1, type=‘Integer‘, desc=‘Total number of alleles in called genotypes‘)), (‘AF‘, Info(id=‘AF‘, num=-1, type=‘Float‘, desc=‘Estimated allele frequency in the range (0,1]‘)), (‘RO‘, Info(id=‘RO‘, num=1, type=‘Integer‘, desc=‘Reference allele observation count, with partial observations recorded fractionally‘)), (‘AO‘, Info(id=‘AO‘, num=-1, type=‘Integer‘, desc=‘Alternate allele observations, with partial observations recorded fractionally‘)), (‘PRO‘, Info(id=‘PRO‘, num=1, type=‘Float‘, desc=‘Reference allele observation count, with partial observations recorded fractionally‘)), (‘PAO‘, Info(id=‘PAO‘, num=-1, type=‘Float‘, desc=‘Alternate allele observations, with partial observations recorded fractionally‘)), (‘QR‘, Info(id=‘QR‘, num=1, type=‘Integer‘, desc=‘Reference allele quality sum in phred‘)), (‘QA‘, Info(id=‘QA‘, num=-1, type=‘Integer‘, desc=‘Alternate allele quality sum in phred‘)), (‘PQR‘, Info(id=‘PQR‘, num=1, type=‘Float‘, desc=‘Reference allele quality sum in phred for partial observations‘)), (‘PQA‘, Info(id=‘PQA‘, num=-1, type=‘Float‘, desc=‘Alternate allele quality sum in phred for partial observations‘)), (‘SRF‘, Info(id=‘SRF‘, num=1, type=‘Integer‘, desc=‘Number of reference observations on the forward strand‘)), (‘SRR‘, Info(id=‘SRR‘, num=1, type=‘Integer‘, desc=‘Number of reference observations on the reverse strand‘)), (‘SAF‘, Info(id=‘SAF‘, num=-1, type=‘Integer‘, desc=‘Number of alternate observations on the forward strand‘)), (‘SAR‘, Info(id=‘SAR‘, num=-1, type=‘Integer‘, desc=‘Number of alternate observations on the reverse strand‘)), (‘SRP‘, Info(id=‘SRP‘, num=1, type=‘Float‘, desc="Strand balance probability for the reference allele: Phred-scaled upper-bounds estimate of the probability of observing the deviation between SRF and SRR given E(SRF/SRR) ~ 0.5, derived using Hoeffding‘s inequality")), (‘SAP‘, Info(id=‘SAP‘, num=-1, type=‘Float‘, desc="Strand balance probability for the alternate allele: Phred-scaled upper-bounds estimate of the probability of observing the deviation between SAF and SAR given E(SAF/SAR) ~ 0.5, derived using Hoeffding‘s inequality")), (‘AB‘, Info(id=‘AB‘, num=-1, type=‘Float‘, desc=‘Allele balance at heterozygous sites: a number between 0 and 1 representing the ratio of reads showing the reference allele to all reads, considering only reads from individuals called as heterozygous‘)), (‘ABP‘, Info(id=‘ABP‘, num=-1, type=‘Float‘, desc="Allele balance probability at heterozygous sites: Phred-scaled upper-bounds estimate of the probability of observing the deviation between ABR and ABA given E(ABR/ABA) ~ 0.5, derived using Hoeffding‘s inequality")), (‘RUN‘, Info(id=‘RUN‘, num=-1, type=‘Integer‘, desc=‘Run length: the number of consecutive repeats of the alternate allele in the reference genome‘)), (‘RPP‘, Info(id=‘RPP‘, num=-1, type=‘Float‘, desc="Read Placement Probability: Phred-scaled upper-bounds estimate of the probability of observing the deviation between RPL and RPR given E(RPL/RPR) ~ 0.5, derived using Hoeffding‘s inequality")), (‘RPPR‘, Info(id=‘RPPR‘, num=1, type=‘Float‘, desc="Read Placement Probability for reference observations: Phred-scaled upper-bounds estimate of the probability of observing the deviation between RPL and RPR given E(RPL/RPR) ~ 0.5, derived using Hoeffding‘s inequality")), (‘EPP‘, Info(id=‘EPP‘, num=-1, type=‘Float‘, desc="End Placement Probability: Phred-scaled upper-bounds estimate of the probability of observing the deviation between EL and ER given E(EL/ER) ~ 0.5, derived using Hoeffding‘s inequality")), (‘EPPR‘, Info(id=‘EPPR‘, num=1, type=‘Float‘, desc="End Placement Probability for reference observations: Phred-scaled upper-bounds estimate of the probability of observing the deviation between EL and ER given E(EL/ER) ~ 0.5, derived using Hoeffding‘s inequality")), (‘DPRA‘, Info(id=‘DPRA‘, num=-1, type=‘Float‘, desc=‘Alternate allele depth ratio. Ratio between depth in samples with each called alternate allele and those without.‘)), (‘ODDS‘, Info(id=‘ODDS‘, num=1, type=‘Float‘, desc=‘The log odds ratio of the best genotype combination to the second-best.‘)), (‘GTI‘, Info(id=‘GTI‘, num=1, type=‘Integer‘, desc=‘Number of genotyping iterations required to reach convergence or bailout.‘)), (‘TYPE‘, Info(id=‘TYPE‘, num=-1, type=‘String‘, desc=‘The type of allele, either snp, mnp, ins, del, or complex.‘)), (‘CIGAR‘, Info(id=‘CIGAR‘, num=-1, type=‘String‘, desc="The extended CIGAR representation of each alternate allele, with the exception that ‘=‘ is replaced by ‘M‘ to ease VCF parsing. Note that INDEL alleles do not have the first matched base (which is provided by default, per the spec) referred to by the CIGAR.")), (‘NUMALT‘, Info(id=‘NUMALT‘, num=1, type=‘Integer‘, desc=‘Number of unique non-reference alleles in called genotypes at this position.‘)), (‘MEANALT‘, Info(id=‘MEANALT‘, num=-1, type=‘Float‘, desc=‘Mean number of unique non-reference allele observations per sample with the corresponding alternate alleles.‘)), (‘LEN‘, Info(id=‘LEN‘, num=-1, type=‘Integer‘, desc=‘allele length‘)), (‘MQM‘, Info(id=‘MQM‘, num=-1, type=‘Float‘, desc=‘Mean mapping quality of observed alternate alleles‘)), (‘MQMR‘, Info(id=‘MQMR‘, num=1, type=‘Float‘, desc=‘Mean mapping quality of observed reference alleles‘)), (‘PAIRED‘, Info(id=‘PAIRED‘, num=-1, type=‘Float‘, desc=‘Proportion of observed alternate alleles which are supported by properly paired read fragments‘)), (‘PAIREDR‘, Info(id=‘PAIREDR‘, num=1, type=‘Float‘, desc=‘Proportion of observed reference alleles which are supported by properly paired read fragments‘))])

如果想看某一个缩写的描述而不是全部可以：

>>> myvcf.infos[‘DP‘].desc
‘Total read depth at the locus‘

>>> myvcf.infos[‘RO‘].desc
‘Reference allele observation count, with partial observations recorded fractionally‘
>>> myvcf.infos[‘AO‘].desc
‘Alternate allele observations, with partial observations recorded fractionally‘

如果你不想从头到尾循环某个文件。只想取某一部分，可以用fetch, 但是前体是用tabix对文件index， tabix前要用bgzip压缩

bgzip testpyvcf.vcf #得到testpyvcf.vcf.gz文件

tabix -p vcf testpyvcf.vcf.gz #得到testpyvcf.vcf.gz的index文件：testpyvcf.vcf.gz.tbi

>>> import vcf
>>> myvcf = vcf.Reader(filename=‘testpyvcf.vcf.gz‘)

>>> for i in myvcf.fetch(‘Chr1‘, 1111, 444444): #第一个是序列名，第二个起始，第三个end,包括。加入vcf文件中有两个位点100 和 200 如果（100,200），会返回这两个record。... print i
...
Record(CHROM=Chr1, POS=11553, REF=G, ALT=[C])
Record(CHROM=Chr1, POS=12840, REF=T, ALT=[G])
Record(CHROM=Chr1, POS=16188, REF=GAAAAAAAG, ALT=[GAAAAAAAAG])
Record(CHROM=Chr1, POS=18915, REF=CAAAAAAG, ALT=[CAAAAAAAG])
Record(CHROM=Chr1, POS=19439, REF=CTTTTTTTTTA, ALT=[CTTTTTTTTTTTA])
Record(CHROM=Chr1, POS=24810, REF=ATTTTTTTTTC, ALT=[ATTTTTTTTTTC])
Record(CHROM=Chr1, POS=26067, REF=CAAAAAAG, ALT=[CAAAAAAAG])
Record(CHROM=Chr1, POS=26996, REF=CAAAAAAAAT, ALT=[CAAAAAAAAAAT])
Record(CHROM=Chr1, POS=27142, REF=C, ALT=[G])
Record(CHROM=Chr1, POS=27698, REF=CTTTTTTTTC, ALT=[CTTTTTTTTTC])
Record(CHROM=Chr1, POS=30645, REF=A, ALT=[C])
Record(CHROM=Chr1, POS=31478, REF=C, ALT=[T])
Record(CHROM=Chr1, POS=33667, REF=A, ALT=[G])
Record(CHROM=Chr1, POS=34057, REF=C, ALT=[T])
Record(CHROM=Chr1, POS=34339, REF=TAAAAAAAAAC, ALT=[TAAAAAAAAAAC])
Record(CHROM=Chr1, POS=35604, REF=T, ALT=[G])

myvcf = vcf.Reader(filename=‘testpyvcf.vcf.gz‘)
for i in myvcf.fetch(‘Chr1‘,34339): #只提供一个点，而不是区间，会返回这个点出样本的call对象。
print i

Call(sample=12, CallData(GT=0/1, DP=47, RO=43, QR=1501, AO=4, QA=127, GL=[-2.8504, 0.0, -10.0]))
Call(sample=CS48, CallData(GT=1/1, DP=82, RO=4, QR=146, AO=68, QA=2316, GL=[-10.0, -2.51126, 0.0]))
Call(sample=F12, CallData(GT=0/1, DP=27, RO=22, QR=785, AO=3, QA=95, GL=[-4.4559800000000003, 0.0, -10.0]))

by freemao

FAFU.

free_mao@qq.com

声明：以上内容来自用户投稿及互联网公开渠道收集整理发布，本网站不拥有所有权，未作人工编辑处理，也不承担相关法律责任，若内容有误或涉及侵权可进行投诉：投诉/举报工作人员会在5个工作日内联系你，一经查实，本站将立刻删除涉嫌侵权内容。

联系
我们

首页 > 代码库 > vcf 模块

vcf 模块

看完仍有疑问？有类似问题直接问程序猿