Difference between revisions of "Tips kang"
(27 intermediate revisions by one user not shown) | |||
Line 1: | Line 1: | ||
+ | '''job finish notify''' | ||
+ | |||
+ | mpack -s ${PWD}:done log kangyangjae@gmail.com | ||
+ | |||
+ | '''G.max genes for speciation''' | ||
+ | |||
+ | GMAA|Glyma14g07440.1 Glyma.14G068500.Wm82.a2.v1 | ||
+ | |||
+ | GMAA|Glyma17g05020.2 Glyma.17G045400.Wm82.a2.v1 | ||
+ | |||
+ | GMAA|Glyma17g05030.1 Glyma.17G045500.Wm82.a2.v1 | ||
+ | |||
+ | GMAA|Glyma20g37700.1 Glyma.20G233300.Wm82.a2.v1 | ||
+ | |||
+ | GMAA|Glyma01g36870.1 Glyma.01G164100.Wm82.a2.v1 | ||
+ | |||
+ | GMAA|Glyma05g04860.1 Glyma.05G061300.Wm82.a2.v1 | ||
+ | |||
+ | GMAA|Glyma07g00730.1 Glyma.07G005200.Wm82.a2.v1 | ||
+ | |||
+ | GMAA|Glyma11g36980.3 Glyma.11G251500.Wm82.a2.v1 | ||
+ | |||
+ | GMAA|Glyma12g04760.3 Glyma.12G043400.Wm82.a2.v1 | ||
+ | |||
+ | GMAB|Glyma02g41510.2 Glyma.02G247800.Wm82.a2.v1 | ||
+ | |||
+ | GMAB|Glyma13g17490.1 Glyma.13G114600.Wm82.a2.v1 | ||
+ | |||
+ | GMAB|Glyma13g17450.1 Glyma.13G114300.Wm82.a2.v1 | ||
+ | |||
+ | GMAB|Glyma10g29600.1 Glyma.10G155000.Wm82.a2.v1 | ||
+ | |||
+ | GMAB|Glyma11g08390.1 Glyma.11G079200.Wm82.a2.v1 | ||
+ | |||
+ | GMAB|Glyma17g15220.2 Glyma.17G143100.Wm82.a2.v1 | ||
+ | |||
+ | GMAB|Glyma08g21980.2 Glyma.08G205700.Wm82.a2.v1 | ||
+ | |||
+ | GMAB|Glyma18g00903.1 Glyma.18G005600.Wm82.a2.v1 | ||
+ | |||
+ | GMAB|Glyma11g12550.3 Glyma.11G118000.Wm82.a2.v1 | ||
+ | |||
+ | '''moleculo trim''' | ||
+ | |||
+ | --5' trim-- | ||
+ | zcat TN1408D2532barcode1_LongRead.fastq.gz | perl ./prinseq-lite-0.20.4/prinseq-lite.pl -fastq stdin -out_format 1 -min_len 3000 -min_qual_mean 20 -trim_qual_left 30 | ||
+ | |||
+ | '''regular expression''' | ||
+ | |||
+ | arabidopsis genename : bool(re.match('AT[0-9]G[0-9]{5}',x.upper()) | ||
+ | |||
+ | '''peak annotation''' | ||
+ | |||
+ | > head(data) | ||
+ | V1 V2 | ||
+ | 1 Aly 0.2766870 | ||
+ | 2 Aly 0.0920515 | ||
+ | 3 Aly 0.5407250 | ||
+ | 4 Aly 0.6232970 | ||
+ | 5 Aly 0.6338910 | ||
+ | 6 Aly 0.9712740 | ||
+ | |||
+ | library(ggplot2) | ||
+ | library(grid) | ||
+ | library(plyr) | ||
+ | densMode <- function(x){ | ||
+ | td <- density(x) | ||
+ | maxDens <- which.max(td$y) | ||
+ | list(x=td$x[maxDens], y=td$y[maxDens]) | ||
+ | } | ||
+ | xdat <- ddply(data,"V1",transform,val_mean=signif(densMode(V2)$x,3), med.x = signif(densMode(V2)$x,3), med.y=signif(densMode(V2)$y,3)) | ||
+ | hp <- ggplot(data=data, aes(x=V2)) + geom_density() + geom_vline(aes(xintercept=val_mean),xdat, color="red",linetype="dashed",size=1) + theme_bw() | ||
+ | hp<- hp + | ||
+ | facet_grid(V1 ~ .) + | ||
+ | geom_text(data = xdat, aes(x=med.x,y=med.y,label=val_mean,size=10)) + xlim(0,1) | ||
+ | hp | ||
+ | |||
+ | '''Change the orders''' | ||
+ | |||
+ | data$V1 <- factor(data$V1,levels = c("Tca", "Gra", "Ptr", "Ccl", "Egr", "Fve", "Mtr", "Vvi", "Sly", "Ath", "Aly", "Cru", "Bra", "Tha")) | ||
+ | |||
+ | '''R square for linear models''' | ||
+ | data<-read.csv("") | ||
+ | test<- lm(V1,V2, data) # V1 엔 x값 V2엔 y값 | ||
+ | summary(test) | ||
+ | |||
+ | |||
+ | '''SNP density''' | ||
+ | |||
+ | > head(data) | ||
+ | V1 V2 V3 V4 V5 | ||
+ | 1 Gm01 1845338 102 10 1/1 | ||
+ | 2 Gm01 2043826 127 18 1/1 | ||
+ | 3 Gm01 2043846 221 22 1/1 | ||
+ | 4 Gm01 2043914 124 16 1/1 | ||
+ | 5 Gm01 2043998 133 13 1/1 | ||
+ | 6 Gm01 2044202 191 18 1/1 | ||
+ | ggplot(data, aes(x=V2,fill=V1)) + geom_histogram() + facet_wrap(~ V1) | ||
+ | |||
+ | |||
+ | '''msort''' | ||
+ | msort -k 1,n2 S_1_4_5_1.fastq.gz_bismark_bt2_pe.bismark.cov > S_1_4_5_1.fastq.gz_bismark_bt2_pe.bismark.cov.sort | ||
+ | 2번 숫자로 정렬 이후 1번 정렬 | ||
+ | |||
+ | |||
+ | '''ggplot examples''' | ||
+ | http://www.nature.com/psp/journal/v2/n10/full/psp201356a.html | ||
+ | |||
+ | '''synteny chromosome plot''' | ||
+ | python3 /data2/k821209/good_python_code/synteny_plot_parse.py Pv2Va.collinearity.kaks Pv2Va Pv2Va.gff | ||
+ | python3 synteny_plot_parse.py Va2Gm.collinearity.kaks Va2Gm Va2Gm.gff | ||
+ | cat Pv2Va.collinearity.kaks.Rin Va2Gm.collinearity.kaks.Rin Va2Vr.collinearity.kaks.Rin > merge.Rin | ||
+ | |||
+ | R | ||
+ | data<-read.table('merge.Rin',header=F,sep='\t') | ||
+ | sub<-subset(data,!grepl("s",data$V3)) # remove scaffolds | ||
+ | sub<-subset(sub,!grepl("SS",sub$V3)) # remove super scaffolds | ||
+ | png('all.png', height=600, width=1240, res = 100, units = 'px') | ||
+ | print(ggplot(sub, aes(x=V4,y=V7,color=factor(V5))) + geom_point(alpha=0.5,size=1) + facet_grid(V3 ~ V1) + ylim(0,2)) | ||
+ | dev.off() | ||
+ | |||
+ | |||
+ | '''gpu blast''' | ||
+ | |||
+ | Orthomcl 용 | ||
+ | /data/k821209/programs/gpu-blast-1.1_ncbi-blast-2.2.26/ncbi-blast-2.2.26+-src/c++/GCC460-ReleaseMT64/bin/makeblastdb -in goodProteins.fasta -out goodProteins.fasta.sorted -sort_volumes | ||
+ | /data/k821209/programs/gpu-blast-1.1_ncbi-blast-2.2.26/ncbi-blast-2.2.26+-src/c++/GCC460-ReleaseMT64/bin/blastp -query ./goodProteins.fasta -db ./goodProteins.fasta.sorted -gpu t -method 2 | ||
+ | /data/k821209/programs/gpu-blast-1.1_ncbi-blast-2.2.26/ncbi-blast-2.2.26+-src/c++/GCC460-ReleaseMT64/bin/blastp -query ./goodProteins.fasta -dbsize 101358393 -evalue 1e-5 -out self.out -outfmt 6 -seg yes -db ./goodProteins.fasta.sorted -gpu t | ||
+ | /data/k821209/programs/gpu-blast-1.1_ncbi-blast-2.2.26/ncbi-blast-2.2.26+-src/c++/GCC460-ReleaseMT64/bin/blastp -query ./adzuki.ver3.pep.fa -max_target_seqs 1 -evalue 1e-5 -out Va2Vn.bp.1e5.out7 -outfmt 6 -db ./ysp84_q20-scaffolds.fa.over10k.farenamed.fa.pep.fa.sorted -num_threads 2 -gpu t | ||
+ | |||
+ | |||
+ | '''Gene length density plot''' | ||
+ | python3 length_distribution.py Gmax_189_gene.gff3 G.max | ||
+ | python3 length_distribution.py Vradi_ver6.gff.sort.gff V.radiata | ||
+ | python3 length_distribution.py adzuki.ref.all.gff.maker.gff V.angularis | ||
+ | python3 length_distribution.py Pvulgaris_218_v1.0.gene.gff P.vulgaris | ||
+ | cat *.ld.out > ld.out.Rin | ||
+ | R | ||
+ | data<-read.table('ld.out.Rin',header=F) | ||
+ | sub <- subset(data, V2=="CDS" | V2=="gene" | V2=="intron") | ||
+ | ggplot(sub,aes(x=V3,color=V1)) + geom_density() + facet_grid(. ~ V2) + xlim(0,1000) + theme(legend.title=element_blank(),axis.text.x = element_text(angle = 90, hjust = 1)) + xlab('Length (bp)') | ||
+ | |||
+ | |||
'''TE density plot''' | '''TE density plot''' | ||
repeat masker output file | repeat masker output file | ||
Line 12: | Line 155: | ||
'''Ks density plot with peak label''' | '''Ks density plot with peak label''' | ||
+ | find ./Va2Vnip/ -name '*.kaks' | xargs cat | grep -v 'Sequence' > Va2Vnip.kaks | ||
+ | |||
sub | sub | ||
V1 V2 | V1 V2 | ||
Line 269: | Line 414: | ||
$ mv VRA.fasta ./compliantFasta/ # same for the ATH.fasta etc. | $ mv VRA.fasta ./compliantFasta/ # same for the ATH.fasta etc. | ||
$ orthomclFilterFasta ./compliantFasta/ 10 20 | $ orthomclFilterFasta ./compliantFasta/ 10 20 | ||
+ | |||
$ blastall -m 8 -a 10 -p blastp -F 'm S' -v 100000 -b 100000 -z 259701 -e 1e-5 -d goodProteins.fasta -i goodProteins.fasta -o goodProteins.fasta.self.out # 27개 할 때 보름 이상걸림 | $ blastall -m 8 -a 10 -p blastp -F 'm S' -v 100000 -b 100000 -z 259701 -e 1e-5 -d goodProteins.fasta -i goodProteins.fasta -o goodProteins.fasta.self.out # 27개 할 때 보름 이상걸림 | ||
+ | -z database size : grep -v '>' INPUT.fa | tr -d '\n' | wc | ||
+ | $ gpu:/data/k821209/programs/gpu-blast-1.1_ncbi-blast-2.2.26/ncbi-blast-2.2.26+-src/c++/GCC460-ReleaseMT64/bin/blastp -query ./goodProteins.fasta -dbsize 101358393 -evalue 1e-5 -out self.out -outfmt 6 -seg yes -db ./goodProteins.fasta.sorted -gpu t | ||
+ | |||
$ parallel -j 10 "blastall -m 8 -p blastp -F 'm S' -v 100000 -b 100000 -z 259701 -e 1e-5 -d goodProteins.fasta -i {1} -o {1}.out" ::: ???.fasta | $ parallel -j 10 "blastall -m 8 -p blastp -F 'm S' -v 100000 -b 100000 -z 259701 -e 1e-5 -d goodProteins.fasta -i {1} -o {1}.out" ::: ???.fasta | ||
$ orthomclBlastParser goodProteins.fasta.self.out compliantFasta/ >> similarSequences.txt # 27개 accession할 때 이 스텝만 6시간 정도 걸림 | $ orthomclBlastParser goodProteins.fasta.self.out compliantFasta/ >> similarSequences.txt # 27개 accession할 때 이 스텝만 6시간 정도 걸림 | ||
Line 277: | Line 426: | ||
$ mcl mclInput --abc -I 1.5 -o mclOutput | $ mcl mclInput --abc -I 1.5 -o mclOutput | ||
$ orthomclMclToGroups CLST 1000 < mclOutput > groups.txt | $ orthomclMclToGroups CLST 1000 < mclOutput > groups.txt | ||
+ | $ python3 /data2/k821209/good_python_code/orthomcl/groups_count.py | ||
R | R | ||
Line 345: | Line 495: | ||
꼼수이다. 참고로. 일단 그냥 Whole genome seqeunce fasta file을 그냥 WGS submission portal로 넣어버리고 하루를 기다린다. 뭐가 잘못되었다고 구구절절 온다. 이렇게 하라고도 구구절절온다. 그렇게 해서 보내준다. 끝. | 꼼수이다. 참고로. 일단 그냥 Whole genome seqeunce fasta file을 그냥 WGS submission portal로 넣어버리고 하루를 기다린다. 뭐가 잘못되었다고 구구절절 온다. 이렇게 하라고도 구구절절온다. 그렇게 해서 보내준다. 끝. | ||
https://submit.ncbi.nlm.nih.gov/subs/wgs/ | https://submit.ncbi.nlm.nih.gov/subs/wgs/ | ||
+ | http://www.ncbi.nlm.nih.gov/WebSub/template.cgi # template 만들기 논문정보 입력 | ||
+ | ./linux64.tbl2asn -p ./ -i OP84_q20-scaffolds.fa.over1k.fa.cor.fa.nc.fa -M n -j "[organism=Elaeis guineensis]" -Z discrep -a r10k -l paired-ends -t template.sbt # Gapped assembly submission | ||
'''Mutation rate''' | '''Mutation rate''' | ||
Line 431: | Line 583: | ||
'''추천 논문''' | '''추천 논문''' | ||
− | Paterson, A.H., M. Freeling, H. Tang and X. Wang. 2010. Insights from the comparison of plant genome sequences. Annual Review of Plant Biology 61: 349-372. <ref>[http://www.annualreviews.org/doi/abs/10.1146/annurev-arplant-042809-112235 Paterson, A.H., M. Freeling, H. Tang and X. Wang. 2010. Insights from the comparison of plant genome sequences. Annual Review of Plant Biology 61: 349-372. ]</ref> | + | http://www.sciencedirect.com/science/article/pii/S016777990900119X |
+ | |||
+ | Paterson, A.H., M. Freeling, H. Tang and X. Wang. 2010. Insights from the comparison of plant genome sequences. Annual Review of Plant Biology 61: 349-372. <ref> | ||
+ | [http://www.annualreviews.org/doi/abs/10.1146/annurev-arplant-042809-112235 Paterson, A.H., M. Freeling, H. Tang and X. Wang. 2010. Insights from the comparison of plant genome sequences. Annual Review of Plant Biology 61: 349-372. ]</ref> | ||
<references /> | <references /> |
Latest revision as of 07:54, 5 January 2015
job finish notify
mpack -s ${PWD}:done log kangyangjae@gmail.com
G.max genes for speciation
GMAA|Glyma14g07440.1 Glyma.14G068500.Wm82.a2.v1
GMAA|Glyma17g05020.2 Glyma.17G045400.Wm82.a2.v1
GMAA|Glyma17g05030.1 Glyma.17G045500.Wm82.a2.v1
GMAA|Glyma20g37700.1 Glyma.20G233300.Wm82.a2.v1
GMAA|Glyma01g36870.1 Glyma.01G164100.Wm82.a2.v1
GMAA|Glyma05g04860.1 Glyma.05G061300.Wm82.a2.v1
GMAA|Glyma07g00730.1 Glyma.07G005200.Wm82.a2.v1
GMAA|Glyma11g36980.3 Glyma.11G251500.Wm82.a2.v1
GMAA|Glyma12g04760.3 Glyma.12G043400.Wm82.a2.v1
GMAB|Glyma02g41510.2 Glyma.02G247800.Wm82.a2.v1
GMAB|Glyma13g17490.1 Glyma.13G114600.Wm82.a2.v1
GMAB|Glyma13g17450.1 Glyma.13G114300.Wm82.a2.v1
GMAB|Glyma10g29600.1 Glyma.10G155000.Wm82.a2.v1
GMAB|Glyma11g08390.1 Glyma.11G079200.Wm82.a2.v1
GMAB|Glyma17g15220.2 Glyma.17G143100.Wm82.a2.v1
GMAB|Glyma08g21980.2 Glyma.08G205700.Wm82.a2.v1
GMAB|Glyma18g00903.1 Glyma.18G005600.Wm82.a2.v1
GMAB|Glyma11g12550.3 Glyma.11G118000.Wm82.a2.v1
moleculo trim
--5' trim-- zcat TN1408D2532barcode1_LongRead.fastq.gz | perl ./prinseq-lite-0.20.4/prinseq-lite.pl -fastq stdin -out_format 1 -min_len 3000 -min_qual_mean 20 -trim_qual_left 30
regular expression
arabidopsis genename : bool(re.match('AT[0-9]G[0-9]{5}',x.upper())
peak annotation
> head(data) V1 V2 1 Aly 0.2766870 2 Aly 0.0920515 3 Aly 0.5407250 4 Aly 0.6232970 5 Aly 0.6338910 6 Aly 0.9712740
library(ggplot2) library(grid) library(plyr) densMode <- function(x){ td <- density(x) maxDens <- which.max(td$y) list(x=td$x[maxDens], y=td$y[maxDens]) } xdat <- ddply(data,"V1",transform,val_mean=signif(densMode(V2)$x,3), med.x = signif(densMode(V2)$x,3), med.y=signif(densMode(V2)$y,3)) hp <- ggplot(data=data, aes(x=V2)) + geom_density() + geom_vline(aes(xintercept=val_mean),xdat, color="red",linetype="dashed",size=1) + theme_bw() hp<- hp + facet_grid(V1 ~ .) + geom_text(data = xdat, aes(x=med.x,y=med.y,label=val_mean,size=10)) + xlim(0,1) hp
Change the orders
data$V1 <- factor(data$V1,levels = c("Tca", "Gra", "Ptr", "Ccl", "Egr", "Fve", "Mtr", "Vvi", "Sly", "Ath", "Aly", "Cru", "Bra", "Tha"))
R square for linear models data<-read.csv("") test<- lm(V1,V2, data) # V1 엔 x값 V2엔 y값 summary(test)
SNP density
> head(data) V1 V2 V3 V4 V5 1 Gm01 1845338 102 10 1/1 2 Gm01 2043826 127 18 1/1 3 Gm01 2043846 221 22 1/1 4 Gm01 2043914 124 16 1/1 5 Gm01 2043998 133 13 1/1 6 Gm01 2044202 191 18 1/1 ggplot(data, aes(x=V2,fill=V1)) + geom_histogram() + facet_wrap(~ V1)
msort
msort -k 1,n2 S_1_4_5_1.fastq.gz_bismark_bt2_pe.bismark.cov > S_1_4_5_1.fastq.gz_bismark_bt2_pe.bismark.cov.sort 2번 숫자로 정렬 이후 1번 정렬
ggplot examples
http://www.nature.com/psp/journal/v2/n10/full/psp201356a.html
synteny chromosome plot
python3 /data2/k821209/good_python_code/synteny_plot_parse.py Pv2Va.collinearity.kaks Pv2Va Pv2Va.gff python3 synteny_plot_parse.py Va2Gm.collinearity.kaks Va2Gm Va2Gm.gff cat Pv2Va.collinearity.kaks.Rin Va2Gm.collinearity.kaks.Rin Va2Vr.collinearity.kaks.Rin > merge.Rin
R data<-read.table('merge.Rin',header=F,sep='\t') sub<-subset(data,!grepl("s",data$V3)) # remove scaffolds sub<-subset(sub,!grepl("SS",sub$V3)) # remove super scaffolds png('all.png', height=600, width=1240, res = 100, units = 'px') print(ggplot(sub, aes(x=V4,y=V7,color=factor(V5))) + geom_point(alpha=0.5,size=1) + facet_grid(V3 ~ V1) + ylim(0,2)) dev.off()
gpu blast
Orthomcl 용
/data/k821209/programs/gpu-blast-1.1_ncbi-blast-2.2.26/ncbi-blast-2.2.26+-src/c++/GCC460-ReleaseMT64/bin/makeblastdb -in goodProteins.fasta -out goodProteins.fasta.sorted -sort_volumes /data/k821209/programs/gpu-blast-1.1_ncbi-blast-2.2.26/ncbi-blast-2.2.26+-src/c++/GCC460-ReleaseMT64/bin/blastp -query ./goodProteins.fasta -db ./goodProteins.fasta.sorted -gpu t -method 2 /data/k821209/programs/gpu-blast-1.1_ncbi-blast-2.2.26/ncbi-blast-2.2.26+-src/c++/GCC460-ReleaseMT64/bin/blastp -query ./goodProteins.fasta -dbsize 101358393 -evalue 1e-5 -out self.out -outfmt 6 -seg yes -db ./goodProteins.fasta.sorted -gpu t /data/k821209/programs/gpu-blast-1.1_ncbi-blast-2.2.26/ncbi-blast-2.2.26+-src/c++/GCC460-ReleaseMT64/bin/blastp -query ./adzuki.ver3.pep.fa -max_target_seqs 1 -evalue 1e-5 -out Va2Vn.bp.1e5.out7 -outfmt 6 -db ./ysp84_q20-scaffolds.fa.over10k.farenamed.fa.pep.fa.sorted -num_threads 2 -gpu t
Gene length density plot
python3 length_distribution.py Gmax_189_gene.gff3 G.max python3 length_distribution.py Vradi_ver6.gff.sort.gff V.radiata python3 length_distribution.py adzuki.ref.all.gff.maker.gff V.angularis python3 length_distribution.py Pvulgaris_218_v1.0.gene.gff P.vulgaris cat *.ld.out > ld.out.Rin R data<-read.table('ld.out.Rin',header=F) sub <- subset(data, V2=="CDS" | V2=="gene" | V2=="intron") ggplot(sub,aes(x=V3,color=V1)) + geom_density() + facet_grid(. ~ V2) + xlim(0,1000) + theme(legend.title=element_blank(),axis.text.x = element_text(angle = 90, hjust = 1)) + xlab('Length (bp)')
TE density plot
repeat masker output file repeat library blast file # 리핏 라이브러리를 잘 알려진 classification에 블라스트 해놓는다. python3 rmout2tbl.py R data<-read.table('adzuki.ref.fa.out.Rin',header=T) png('all.png', height=700, width=1000, res = 120, units = 'px') print(ggplot(data,aes(x=Position,fill=TE)) + geom_histogram(binwidth=100000) + xlim(0,40000000) + facet_grid(Chromosome ~ .)) dev.off()
Ks density plot with peak label
find ./Va2Vnip/ -name '*.kaks' | xargs cat | grep -v 'Sequence' > Va2Vnip.kaks sub V1 V2 1 GMA 1.2872000 2 GMA 0.8307020 3 GMA 1.1459092 4 GMA 0.5148436 5 GMA 0.9471062 6 GMA 0.7149813 h <- hist(sub$V2, breaks=seq(-2,7,by=0.01)) p + geom_density(aes(color=V1)) + annotate("text", label = sprintf("Peak : %s",h$breaks[findPeaks(h$density)[1]]) , x = 1, y = 1, size = 8, colour = "red")
KaKs calc result, scale gradient
input : Vang0002S00010.1.axt LWL 0.00258044 0.0126718 0.203637 0.0067590 ...
sub <- subset(data,0<V5 & V5<2) p <- ggplot(sub, aes(x=V4,y=V3,color=V5)) p + geom_point()+ scale_colour_gradient2(low='blue',mid='white',high='red',midpoint=1) + xlim(0,2) + ylim(0,2)
frequency 숫자로 가져오기
res<-hist(sub$V5,breaks=seq(0,2,by=0.02)) cbind(res$breaks,res$counts)
Venn dia gram 4 elements
Filename : ts.Rin Flower Leaf Pod Root Vang01g00010 Vang01g00010 Vang01g00010 Vang01g00010 Vang01g00020 Vang01g00030 Vang01g00030 Vang01g00020 Vang01g00030 Vang01g00050 Vang01g00050 Vang01g00030 Vang01g00050 Vang01g00110 Vang01g00110 Vang01g00050 Vang01g00110 Vang01g00120 Vang01g00120 Vang01g00110 Vang01g00120 Vang01g00160 Vang01g00160 Vang01g00120 Vang01g00140 Vang01g00170 Vang01g00170 Vang01g00140 Vang01g00160 Vang01g00250 Vang01g00250 Vang01g00160 Vang01g00170 Vang01g00270 Vang01g00270 Vang01g00170 Vang01g00180 Vang01g00310 Vang01g00310 Vang01g00250 Vang01g00210 Vang01g00320 Vang01g00320 Vang01g00270 .....
위와 같은 표를 준비한다.
source("http://faculty.ucr.edu/~tgirke/Documents/R_BioCond/My_R_Scripts/overLapper.R") sets <- read.delim("ts.Rin") setlistImp <- lapply(colnames(sets), function(x) as.character(sets[sets[,x]!="", x])) names(setlistImp) <- colnames(sets) setlist4 <- setlistImp[1:4]; OLlist4 <- overLapper(setlist=setlist4, sep="_", type="vennsets") counts <- sapply(OLlist4$Venn_List, length) vennPlot(counts=counts, ccol=c(rep(1,30),2), lcex=1.5, ccex=c(rep(1.5,5), rep(0.6,25),1.5))
MapMan GO parsing 193:/data2/k821209/good_python_code/
MapMan GO : Gmax_189.txt Va2Gm mcscanx 를 통해 Gm2Va.collinearity.kaks.recentpeakWGD 파일을 구한다. div_WGD_gen.Ks.py 참조 python3 mapman_GO.py
SSR based genetic map anchoring 193:/data2/k821209/good_python_code/anchor_SSRmap/
prepare 1: SSR.sts CEDC003 TTGCCAGAAAAGAAAGGAGC CAAGAAGTTTGCATTGCATC CEDC015 CATTTCGCTTACTTAAGCTC CACACTCATTTATTCTCACC CEDC024 CACACTCATTTATTCTCACC CTTTACTCTCAACATTTCGC
prepare 2: map_Va.txt group LG1 CEDG149 0 CEDG133 0.5 CEDG144 3.1 CEDG141 27.7 CEDG263 29.4
prepare 3: anchor target fasta
$ famap -tN -b genome.famap ./superscaf.ver1.fa $ fahash -b genome.hash -w 12 -f3 ${PWD}/genome.famap $ re-PCR -S genome.hash -n1 -g1 SSR.sts -o SSR.sts.mapped $ python3 multiple_map_remove.py ssr.sts.mapped $ python3 mapped_parse.py ssr.sts.mapped.demult $ python3 SSRmapped_sort.py ssr.sts.mapped.demult.parsed $ python3 Pseudo_mol_SSR.py
GBS 193:/data2/k821209/good_python_code/gbs_joinmap_anchor/
parallel -j 7 bcftools index {1} ::: *.bcf parallel -j 7 --colsep='\t' python3 genotyping.py {1} {2} :::: sites.txt
다음과 같은 형식으로 정리
SuperScaf_9_9607088 180 T C,T,C,C,C,T,T,T,C,N,H,C,C,T,T,C,T,C,C,C,T,T,H,C,C,C,T,T,C,C,T,C,C,T,C,T,T,C,C,T,H,T,C,T,T,T,T,C,C,T,T,C,T,C,C,T,C,C,T,C,C,T,T,T,T,C,T, .....
코드
cat *.out | python3 segregating_site.py > total.txt python3 N_dist.py total.txt joinmap4 에 넣어서 맵 완성 python3 joinmap_out_parse.py joinmap.out python3 double_in_check.py python3 ordered_sort.py python3 Pseudo_mol.py
soap3-dp
/data/k821209/programs/soap3-dp/4.2-amazon/soap3-dp-builder adzuki.ref.fasta # reference fasta file 에 컨티그나 스카폴드가 너무 많으면 에러남.. 몇개가 적정인지는 모르겠음 /data/k821209/programs/soap3-dp/4.2-amazon/BGS-Build adzuki.ref.fasta.index /data/k821209/programs/soap3-dp/4.2-amazon/soap3-dp pair adzuki.ref.fasta.index test_1.fastq test_2.fastq -u 500 -v 200 -b 3 /data/k821209/programs/samtools-0.1.19/samtools cat -o V_nepal_to_redbean.bam V_nepal_1_1.fastq.gz.gout.1 V_nepal_1_1.fastq.gz.gout.2 V_nepal_1_1.fastq.gz.gout.3 V_nepal_1_1.fastq.gz.gout.4 V_nepal_1_1.fastq.gz.dpout.1
gene prediction length distribution
python3 0_length_dist.py Gmax_189_gene.gff3 paste -d , Gmax_189_gene.gff3_CDS.ld.out Gmax_189_gene.gff3_gene.ld.out Gmax_189_gene.gff3_intron.ld.out > gm.paste python3 add_spcs.py gm.paste gm 모든 species에 대해서 같은 작업을 한뒤 *.Rin 중 하나만 첫번째 column을 Species로 이름을 정해주고 나머지들은 header 들을 지워준다. cat gm.Rin 등등 > merged.Rin # 해더가 있는 애들 처음으로 둔다. R library(ggplot2) Data <- read.csv("merged.Rin",header=T) > ggplot(Data, aes(x=CDS,colour=Species, fill = Species)) + geom_bar(alpha = 0.3,binwidth=50,position="identity")+ xlim(0,2000) > ggplot(Data, aes(x=CDS,colour=Species, fill = Species)) + geom_bar(alpha = 0.2,binwidth=50,position="identity")+ xlim(0,2000) > ggplot(Data, aes(x=intron,colour=Species, fill = Species)) + geom_bar(alpha = 0.2,binwidth=50,position="identity")+ xlim(0,2000) > ggplot(Data, aes(x=gene,colour=Species, fill = Species)) + geom_bar(alpha = 0.2,binwidth=500,position="identity")+ xlim(0,20000)
Density plot 가능 > ggplot(Data, aes(x=gene,colour=Species)) + geom_density(alpha = 0.4,position="identity")+ scale_x_continuous(limits = c(0, 2000)) 문제는 NA가 있는 row를 다 제거한다는 문제가 있다.
qplot
http://ggplot2.org/book/qplot.pdf
Jatropha transcriptome
244:/home/k821209/py/ # used code transdecoder runMe.sh 244:$ cdhit -i Trinity.fasta.transdecoder.pep -o Trinity.fasta.transdecoder.pep.cdhit.fa 244:$ python3 pep_header_parser.py Trinity.fasta.transdecoder.pep.cdhit.fa JPO # JPO : 3글자 약어를 각 종 마다 지정해줌
div_WGD_gen.Ks.py
input : filename.colinearity.kaks # MCSCANX result output : filename.colinearity.kaks.Rin R library(ggplot2) Data <- read.table("Va2Gm.collinearity.kaks.Rin",header=F) qplot(V1,data=Data,binwidth=0.01,xlim=c(0,3),ylim=c(0,150),xlab="Ks bin",ylab="Frequency")
CDHIT
est 63:$ /data/program/cd-hit-v4.6.1-2012-08-27/cd-hit-est -i merge.est.fa -o merge.est.fa.cdhit.fa -T 10 -M 10000
Phyml
193:$ mpirun -n 16 phyml-mpi -i concate.fa.phy -q -d aa -m JTT -b 500
Interpro
193:$ /data2/k821209/programs/interproscan-5.4-47.0/interproscan.sh --pathways --goterms -iprlookup -T /data2/k821209/Mungbean/maker4/TMP/ -i /data2/k821209/Redbean/maker/redbean.ref.maker.output/redbean.ref.fasta.pep.fa.noasterix.fa
jellyfish
$ sudo bash $ unlimit -a 4096
파일을 동시에 많이 쓰기 때문에 열어줘야된다.
$ zcat ysp-2_?.fastq.gz | jellyfish count -C -m 22 -s 50G -t 10 -c 6 -o output /dev/fd/0 $ jellyfish merge ./output_* # 결과 파일이 많을 경우 $ jellyfish histo -o hist.out output_0 $ R library(ggplot2) library(quantmod) Data<-read.table("hist.out",header=F) p <- qplot(V1,V2,data=Data,xlim=c(3,200),ylim=c(1,20000000),xlab="Kmer frequency",ylab="Count") p + annotate("text", label = sprintf("Peak : %s",findPeaks(Data$V2[2:200])[1]) , x = findPeaks(Data$V2[2:200])[1], y = Data$V2[findPeaks(Data$V2[2:200])[1]]+1000000, size = 8, colour = "red")
Species divergence time with fixed substitution rate
63:/data2/Mungbean_revision/speciation/Orthomcl/OnlyVigna/pairs
작업경로
63:/home/k821209/py/pairwiseKs/
저장경로
Orthomcl을 통해 얻은 Ortholog.txt가 필요 Orthomcl의 orthomclAdjustFasta로 만들어진 cds sequence set이 필요 $ python3 shared_ortholog.py $ python3 get_table.py
BEASTv1.8
http://www.molecularevolution.org/resources/activities/beast_activity/viruses
tutorial
193:/data2/k821209/good_python_code/BEAST/
쓰이는 코드가 저장되어있는 경로
PREPARE - MCSCANX - Vr2Cc.collinearity.kaks : V.radiata vs C. cajan Vr2Gm.collinearity.kaks : V.radiata vs G. max Vr2Wd.collinearity.kaks : V.radiata vs V.radiata var. sublobata (wild mungbean denovo assembly) Vra2Vrr.collinearity.kaks : V.radiata vs V.reflexo-pilosa var. glabra (allotetraploid denovo assembly) GM2GM.collinearity.kaks : G.max within genome synteny block Vrg2Vrg.collinearity.kaks : V.reflexo-pilosa var. glabra within genome synteny - ORTHOMCL -
$ python3 div_WGD_gen.Ks.py filename.kaks
본 스크립트를 실행하면 0.01 Ks window로 Frequency를 눈으로 볼 수 있는데 peak 가 보이는 영역 중 Frequency가 10 이상인 영역을 확인하고 코드 안에서 targetMin = 0.04, targetMax = 0.12 값을 수정해주면 filename.kaks.target이라는 파일을 생성한다. 해당 영역의 평균, 최대, 최소 MYA를 계산한다. 고정값 상수를 사용하므로 염두에 둔다. 그리고 생성되는 filename.kaks.recentpeakWGD 에는 설정한 범위에 해당하는 synteny pair들이 저장된다. Within genome 의 경우는 A genome, B genome을 구분할 기본 데이터가 되고, Inter genome 의 경우에는 high confident ortholog가 된다.
file_fa = 'final.assembly.11.fasta.nonATGC.fa.cds.primary.fasta.pep.sh.fa' file_vra2vrp = 'Vra2Vrr.collinearity.kaks.recentpeakWGD' file_vrp2vrp = 'Vrg2Vrg.collinearity.kaks.recentpeakWGD' $ python3 sep.py
Within genome synteny에서 A genome B genome 을 구분하는 코드. Within genome의 Recent peak을 A genome과 B genome관계라고 가정하고 V radiata와 가까운 Synteny pair를 A genome, 먼쪽을 B genome으로 저장한다. match.txt에는 A genome과 B genome내 유전자 matching 정보가 들어간다.
mcscanout_list = ['Vr2Cc.collinearity.kaks.recentpeakWGD','Vr2Gm.collinearity.kaks.recentpeakWGD','Vr2Wd.collinearity.kaks.recentpeakWGD','Vra2Vrr.collinearity.kaks.recentpeakWGD'] file_GMAA = 'GMAA.fasta' file_VRGA = 'VRPA.fasta' file_matchGM = 'match.GM.txt' file_matchVRG = 'match.Vrg.txt' file_fa = 'all.fas' $ python3 0_get_SYNB.py
V.radiata, C.cajan, G.max, V. radiata var. sublobata, V. reflexo-pilosa 간의 공유된 Synteny block을 GMAX A genome과 V.replexo A genome을 기준으로 가져온다. 본 연구에서 173개의 Locus가 나옴.
fa_list = glob.glob('*.prealn.fa') orthologs = 'orthologs.txt' dicAHD2seq = kang.Fasta2dic('all.fa') $ python3 0_ortholog_add.py
ortholog.txt는 Vigna species만 사용해서 만든 orthomcl pair파일. *.prealn.fa 는 0_get_SYNB.py 의 결과파일. all.fa는 필요한 시퀀스들.. orthomcl의 compliantFasta/ 를 모으면 됨.
$ parallel -j 10 /data2/k821209/programs/prank/bin/prank -d={1} -o={1}.aln ::: *.good.fa
결과 파일 Alignment
$ python3 3_fastaaln2nexus.py $ parallel -j 10 java -jar /data2/k821209/programs/prottest-3.4-20140123/prottest-3.4.jar -i {1} -S 0 -F -AIC -BIC -tc 0.5 -o {1}.prottest ::: *.good.fa.aln.best.fas.nex
nexus format으로 변환. 꼭 안바꿔도 되는듯(?)
$ cat *.good.fa | grep '>' > prespecies $ python3 species.py > species
beauti에 들어가는 species 파일 생성
BEAST 는 두번돌림. 50,000,000 change of length로 두번돌림
/data2/k821209/programs/BEASTv1.8.0/bin/logcombiner -trees -burnin 0 StarBEASTLog.species.trees StarBEASTLog.species.second.trees merged.trees
두번돌리고 Merge
Maker
$ /data2/k821209/programs/maker/bin/gff3_merge -d redbean.ref_master_datastore_index.log $ cat redbean.ref.all.gff | awk '$2=="maker"' - > redbean.ref.all.gff.maker.gff3 $ python3 sort.py redbean.ref.all.gff.maker.gff3 $ python3 genename_change_nonAnchor.py $ /data2/k821209/programs/maker/bin/map_gff_ids Vang.scaffold.map redbean.ref.all.gff $ cat redbean.ref.all.gff | awk '$2=="maker"' - > redbean.ref.all.gff.final.maker.gff3
Orthomcl
orthomcl 는 알아서 이름을 바꾸면된다. Make config file : orthomcl.config.template dbVendor=mysql dbConnectString=dbi:mysql:orthomcl:mysql_local_infile=1:localhost:3306 dbLogin=orthomcl dbPassword=8804555 similarSequencesTable=SimilarSequences orthologTable=Ortholog inParalogTable=InParalog coOrthologTable=CoOrtholog interTaxonMatchView=InterTaxonMatch percentMatchCutoff=50 evalueExponentCutoff=-5 oracleIndexTblSpc=NONE
$ mysql -u root -p mysql> CREATE DATABASE orthomcl; mysql> GRANT SELECT,INSERT,UPDATE,DELETE,CREATE VIEW,CREATE, INDEX, DROP on orthomcl.* TO orthomcl@localhost; mysql> set password for orthomcl@localhost = password('8804555'); $ orthomclInstallSchema orthomcl.config.template $ orthomclAdjustFasta VRA final.assembly.pep.fasta.primary.SH.fa 1 $ mkdir ./compliantFasta/ $ mv VRA.fasta ./compliantFasta/ # same for the ATH.fasta etc. $ orthomclFilterFasta ./compliantFasta/ 10 20
$ blastall -m 8 -a 10 -p blastp -F 'm S' -v 100000 -b 100000 -z 259701 -e 1e-5 -d goodProteins.fasta -i goodProteins.fasta -o goodProteins.fasta.self.out # 27개 할 때 보름 이상걸림 -z database size : grep -v '>' INPUT.fa | tr -d '\n' | wc $ gpu:/data/k821209/programs/gpu-blast-1.1_ncbi-blast-2.2.26/ncbi-blast-2.2.26+-src/c++/GCC460-ReleaseMT64/bin/blastp -query ./goodProteins.fasta -dbsize 101358393 -evalue 1e-5 -out self.out -outfmt 6 -seg yes -db ./goodProteins.fasta.sorted -gpu t
$ parallel -j 10 "blastall -m 8 -p blastp -F 'm S' -v 100000 -b 100000 -z 259701 -e 1e-5 -d goodProteins.fasta -i {1} -o {1}.out" ::: ???.fasta $ orthomclBlastParser goodProteins.fasta.self.out compliantFasta/ >> similarSequences.txt # 27개 accession할 때 이 스텝만 6시간 정도 걸림 $ orthomclLoadBlast orthomcl.config.template similarSequences.txt # 용량 많이 먹음 충분히 확보 $ orthomclPairs orthomcl.config.template orthomcl_pairs.log cleanup=no $ orthomclDumpPairsFiles orthomcl.config.template $ mcl mclInput --abc -I 1.5 -o mclOutput $ orthomclMclToGroups CLST 1000 < mclOutput > groups.txt $ python3 /data2/k821209/good_python_code/orthomcl/groups_count.py R source("http://faculty.ucr.edu/~tgirke/Documents/R_BioCond/My_R_Scripts/overLapper.R") sets <- read.delim("groups.txt.Rin") setlistImp <- lapply(colnames(sets), function(x) as.character(sets[sets[,x]!="", x])) names(setlistImp) <- colnames(sets) setlist5 <- setlistImp[1:5]; OLlist5 <- overLapper(setlist=setlist5, sep="_", type="vennsets") counts <- sapply(OLlist5$Venn_List, length) vennPlot(counts=counts, ccol=c(rep(1,30),2), lcex=1.5, ccex=c(rep(1.5,5), rep(0.6,25),1.5))
Errors execute failed: Can't create/write to file /tmp 어쩌고.. 이것은 mysql 버전에러라고 하는군. my.cnf 에서 tmp 디렉토리를 data 디렉토리랑 이름을 같게 만들어주면 돌아감. Duplicate entry error $ sort -k3,4 similarSequences.txt.primary > similarSequences.txt.primary.sort $ python3 duplicates_remove_ver1.py # 63:/data2/k821209/good_python_code/ 현재 테스트중
Protest
193server$ cd /data2/k821209/programs/prottest-3.4-20140123/ # 이 디렉토리로 이동해서 실행. 라이브러리 위치 때문에 따로 지정하기 귀찮기 때문에.. 193server$ parallel -j 10 java -jar prottest-3.4.jar -i {1} -o {1}.protest.out -all-distributions -F -AIC -BIC -tc 0.5 ::: /data2/k821209/Mungbean/Mungbean_assemble/ver6/mcscanx/beast_prepare/all/others_ver2/manual/*.nex
jModeltest for DNA alignment
63server $ parallel -j 10 java -jar /data/program/jmodeltest-2.1.5/jModelTest.jar -d {1} -g 4 -i -f -AIC -BIC -a -o {1}.jmodel ::: *.nuc.fas
bamUtil
bam2fastq
bam bam2FastQ --in filename.bam
Picard
sam2fastq
java -jar SamToFastq.jar INPUT=[file_bam] INCLUDE_NON_PRIMARY_ALIGNMENTS=True FASTQ=Name_1.txt SECOND_END_FASTQ=Name_2.txt VALIDATION_STRINGENCY=LENIENT
BEAST
beauti
/data2/k821209/programs/BEAST/bin/beauti -template /data2/k821209/programs/BEAST/templates/StarBeast.xml
beast
'single locus'
java -Xms256m -Xmx202400m -Djava.library.path=/data/program/BEAST/lib/ -jar /data/program/BEAST/lib/beast.jar -beagle_SSE -beagle_instances 10 ver7.xml
'multi loci'
java -Xms256m -Xmx202400m -Djava.library.path=/data/program/BEAST/lib/ -jar /data/program/BEAST/lib/beast.jar -beagle_SSE ver7.xml java -Xms256m -Xmx202400m -Djava.library.path=/data/program/BEAST/lib/ -jar /data/program/BEAST/lib/beast.jar -beagle_SSE 557.xml # 557 loci : Total calculation time: 373863.472 seconds
GPU with multi loci
C:\Users\jun\Desktop\kangyangjae\BEAST.v2.1.2\BEAST\557>java -Xmx15g -jar ..\lib\beast.jar -beagle_order 0,0,0,0,0,0,0,1
0,0,0,0,0,0,0,1 의 의미는 "7개 loci정보 cpu에 던지고 8번째는 gpu에 던져라" loci가 많을 경우는 반복으로 할당. gpu에 다 던지면 좋겠지만 gpu메모리 한계 때문에 이렇게 해야함.
18개 loci, 27 species alignment 에 GPU memory 770 Mb 정도 소요.
NCBI에 WGS 올리기 팁
꼼수이다. 참고로. 일단 그냥 Whole genome seqeunce fasta file을 그냥 WGS submission portal로 넣어버리고 하루를 기다린다. 뭐가 잘못되었다고 구구절절 온다. 이렇게 하라고도 구구절절온다. 그렇게 해서 보내준다. 끝.
https://submit.ncbi.nlm.nih.gov/subs/wgs/ http://www.ncbi.nlm.nih.gov/WebSub/template.cgi # template 만들기 논문정보 입력 ./linux64.tbl2asn -p ./ -i OP84_q20-scaffolds.fa.over1k.fa.cor.fa.nc.fa -M n -j "[organism=Elaeis guineensis]" -Z discrep -a r10k -l paired-ends -t template.sbt # Gapped assembly submission
Mutation rate
Legumes
"If the older duplication is assumed to have occurred around 58Myr ago, then the calculated rate of silent mutations extending back to the duplication would be 5.17*10-3, similar to previous estimates of 5.2*10–3" [1]
Python
Fisher's exact test
from scipy import stats
oddsratio, pvalue = stats.fisher_exact([[A,B], [C, D]]) [2]
63:/home/k821209/py/NGS/vcfq2fa.py : vcfutil로 만들어진 fq 파일을 fa로 변환
Excel
=TEXT(2.2323,"(0.00)")
(2.23)
Softwares
SNP 반영된 fasta 만들기
- java -jar /data/program/picard-tools-1.91/CreateSequenceDictionary.jar R=Gmax_189.fa O=Gmax_189.dict
- 인덱싱 작업인듯
- java -Xmx2g -jar /data/program/GenomeAnalysisTK-3.1-1/GenomeAnalysisTK.jar -R /data/ref/Gmax_189.fa -T FastaAlternateReferenceMaker -o JM14.bam.sort.bam.bcg.bcf.d2D50.vcf.fa --variant JM14.bam.sort.bam.bcg.bcf.d2D50.vcf
- bcftools view [filename.bcf] | /data1/KimSue/Gmax_189/ver1/bcf/bcf2fa.py prefix # GATK의 FastaAlternateReferenceMaker 는 calling이 되지 않는 지역을 reference 서열로 가져오는 문제가 있음.
GATK pipe http://www.broadinstitute.org/gatk//events/2038/
- bwa mem -M -t 10 Va.ref.fa ysp-2_1.fastq.gz ysp-2_2.fastq.gz | /data/program/samtools-0.1.19/samtools view -Sb - | /data/program/samtools-0.1.19/samtools sort - ysp.bwamem.Va.ref.fa.sort # GATK pipe는 -M 옵션이 필요
- /data/program/jdk1.7.0_25/bin/java -jar /data/program/picard-tools-1.91/MarkDuplicates.jar INPUT=ysp.bwamem.Va.ref.fa.sort.bam OUTPUT=ysp.bwamem.Va.ref.fa.sort.bam.dedup.bam METRICS_FILE=metrics.txt MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=1000 TMP_DIR=./tmp/
- MarkDuplicates 는 PCR duplicates를 제거하기 위한 작업. 왜 제거해야하는지는 자세히는 모르지만 PCR bias 때문에 Variant calling statistical model 에 문제가 된다고 함. 특히 PCR중에 polymerase가 실수 했을 경우 실수한 자리가 PCR bias로 대량 생산되었을 경우 마치 SNP처럼 calling되버림. 제거해야한다는 것이 결론. 그러나 특별히 제작된 라이브러리에는 적용해서는 안된다. 예를들면 특정 사이트가 잘리게 만들어놓은 라이브러리? 류들은 PCR bias로 오해받을 가능성? 이 있는듯..
- MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=1000 값을 높여도 되는데 ulimit -n 값을 올려야함. 재부팅 해야된다해서 그냥 낮춰 쓰는중.
그렇게 느린줄 모르겠음. 높여서 안써봐서 그런가.. - TMP_DIR=./tmp/ 이거 빼먹으면 용량 없음 크리를 먹는다.
Maker
- /data2/k821209/programs/maker/bin/gff3_merge -d Va.ref_master_datastore_index.log
- /data2/k821209/programs/maker/bin/maker_map_ids --prefix=Vang --iterate=1 --suffix=. Va.ref.all.gff > id_map.txt # 복잡하게 나오는 maker의 유전자 이름들을 심플하게 바꾸는 툴
- cat final.assembly_40k_GSFLX_pseudo5k.fasta.nonATGC.all.gff | awk '$2=="maker"' - > final.assembly_40k_GSFLX_pseudo5k.fasta.nonATGC.all.gff.maker.gff
- python3 gff_parse.py final.assembly_40k_GSFLX_pseudo5k.fasta.nonATGC.all.gff.maker.gff
- python3 /data2/k821209/Redbean/maker_pseudo/Va.ref.maker.output/genename_change_nonAnchor.py # 이름이 맘에 안들게 바뀌어서 개인적으로 만든 툴
- /data2/k821209/programs/maker/bin/map_gff_ids id_map.txt Va.ref.all.gff # 그리 나온 이름들을 gff 반영하는 툴
- python3 /data2/k821209/Redbean/maker_pseudo/Va.ref.maker.output/header_change.py Va.ref.all.maker.proteins.fasta Vang.scaffold.map # 그리 나온 이름들을 fasta에 반영하는 툴
- /data2/k821209/programs/maker/bin/iprscan2gff3 Va.ref.all.maker.proteins.fasta.tsv.hc.tsv Va.ref.all.gff > Va.ref.all.gff.ipr.gff # interpro result를 jbrowser에 들어가는 모양으로 만들어주는 툴
Deconseq [3]
- Illumina read의 contamination을 확인한다.
- 63:/data/program/deconseq-standalone-0.4.3
- /usr/bin/perl deconseq.pl -keep_tmp_files -f 800_both.fq -dbs bact,vir,arch -dbs_retain gmax
ePCR
Re-PCR
- $ famap -tN -b genome.famap ./superscaf.ver1.fa
- $ fahash -b genome.hash -w 12 -f3 ${PWD}/genome.famap
- Work> /data/program/e-PCR-2.3.12/re-PCR -S genome.hash -n1 -g1 SSR.sts -o SSR.sts.mapped
SSR.sts
Mungbean_SSR_ID_1 CAAAAACATGAGTTGCACACAA TCATAACGCAGAACAGCGAA
Mungbean_SSR_ID_2 ATGTGTGTGAGCACCTCGAC TTTGGCCATGCAAGATGTAA
Mungbean_SSR_ID_4 GCGGTTCACCTAGCCATAAA GGACCCTTCTGTGCGTGTAT
Mungbean_SSR_ID_5 GTTTGTGCTGCGGATTCTTT TTGGCAATTTGGACTAAGGC
Mungbean_SSR_ID_7 TTGACCCAAAACTTACCAATTT GCTAAGGACTGGGGGTCTTC
Mummer, alignment draft genome to finished genome
$nucmer --prefix=ref_qry ref.fasta qry.fasta
$show-coords -rcl ref_qry.delta > ref_qry.coords
$show-aligns ref_qry.delta refname qryname > ref_qry.aligns
$show-tiling ref_qry.delta > ref_qry.tiling
추천 논문
http://www.sciencedirect.com/science/article/pii/S016777990900119X
Paterson, A.H., M. Freeling, H. Tang and X. Wang. 2010. Insights from the comparison of plant genome sequences. Annual Review of Plant Biology 61: 349-372. [4]