Difference between revisions of "2017 Haneul Lab note"
(→Mungbean Chloroplast assembly) |
(→Mungbean pacbio assembly) |
||
(75 intermediate revisions by one user not shown) | |||
Line 187: | Line 187: | ||
samtools index vr.pb.cp_PE.sorted.bam | samtools index vr.pb.cp_PE.sorted.bam | ||
samtools mpileup -f vr.pb.cp.fasta -v -t DP,AD,ADF,ADR,SP,INFO/AD,INFO/ADF,INFO/ADR -u vr.pb.cp_PE.sorted.bam | bcftools call -v -m -O v > variants_PE_bwa.vcf | samtools mpileup -f vr.pb.cp.fasta -v -t DP,AD,ADF,ADR,SP,INFO/AD,INFO/ADF,INFO/ADR -u vr.pb.cp_PE.sorted.bam | bcftools call -v -m -O v > variants_PE_bwa.vcf | ||
+ | .... | ||
+ | bwa mem -t 4 vr.pb.cp.fasta pb.cp.fasta > vr.pb.cp_PB.sam | ||
+ | samtools view -Sb vr.pb.cp_PB.sam > vr.pb.cp_PB.bam | ||
+ | samtools sort vr.pb.cp_PB.bam -o vr.pb.cp_PB.sorted.bam | ||
+ | samtools index vr.pb.cp_PB.sorted.bam | ||
+ | .... | ||
+ | samtools mpileup -f vr.pb.cp.fasta -v -t DP,AD,ADF,ADR,SP,INFO/AD,INFO/ADF,INFO/ADR -u vr.pb.cp_PE.sorted.bam > variants_PE_bwa_all.vcf | ||
+ | python vcf_filtering.py variants_PE_bwa_all.vcf > variants_PE_bwa_all_0.15.vcf | ||
+ | |||
+ | == 2/21 == | ||
+ | === Mungbean Chloroplast assembly === | ||
+ | wrote a script that reads a fasta file and an annotation file (gff or gb) and writes a fasta file of gene CDS sequences (63:/kev8305/Mungbean_assembly/chloroplast/) | ||
+ | python getCDS.py vr.pb.cp.fasta vr.pb.cp.gff > vr.pb.cp.gene.fasta | ||
+ | python getCDS.py v.radiata.fasta v.radiata.gb > v.radiata.gene.fasta | ||
+ | |||
+ | == 2/22 == | ||
+ | === Mungbean pacbio assembly === | ||
+ | snp calling done, snp filtering for genetic map construction (244:/kev8305/SK3/) | ||
+ | python ~/reseq/vcfparse_parent.py variants_snp.vcf KJ_falcon_scaffold_variants_snp.vcf | ||
+ | python ~/reseq/vcfparse.py variants_snp_compare_parents.vcf (dp >= 5, missing < 13, hetero < 10) | ||
+ | python ~/reseq/vcfparse_coseg.py variants_snp_compare_parents_filtered.vcf Mungbean_pacbio_scaffold_3.loc | ||
+ | python locparse.py Mungbean_pacbio_scaffold_3_seg_dist.loc > Mungbean_pacbio_scaffold_3_seg_dist_format.loc (scaffold name is too long, eliminate '|') | ||
+ | |||
+ | == 2/23 == | ||
+ | === Mungbean pacbio assembly === | ||
+ | too many snp for joinmap, so filtering missing < 12 | ||
+ | python ~/reseq/vcfparse.py variants_snp_compare_parents.vcf | ||
+ | python ~/reseq/vcfparse_coseg.py variants_snp_compare_parents_filtered.vcf Mungbean_pacbio_scaffold_4.loc | ||
+ | python ~/reseq/cal_seg_dist.py Mungbean_pacbio_scaffold_4.loc | ||
+ | 9110 | ||
+ | python locparse.py Mungbean_pacbio_scaffold_4_seg_dist.loc > Mungbean_pacbio_scaffold_4_seg_dist_format.loc | ||
+ | |||
+ | == 2/27 == | ||
+ | === Mungbean pacbio assembly === | ||
+ | Mungbean_pacbio_scaffold_7_seg_dist_format.loc : no hetero, missing < 18 | ||
+ | |||
+ | |||
+ | ALLMAPS install (244:/kev8305/skyts0401/program) | ||
+ | easy_install biopython numpy deap networkx matplotlib jcvi | ||
+ | wget https://dl.dropboxusercontent.com/u/15937715/Data/ALLMAPS/ALLMAPS-install.sh | ||
+ | sh ALLMAPS-install.sh | ||
+ | then add the directory containing the ALLMAPS binaries (concorde, faSize, liftOver) to $PATH in ~/.profile | ||
+ | |||
+ | |||
+ | ALLMAPS (244:/kev8305/SK3/anchoring) | ||
+ | python ~/reseq/allmaps_format.py Mungbean_pacbio_5_joinmap.result > Mungbean_pacbio_5_joinmap.for.allmaps | ||
+ | python ~/reseq/allmaps_format.py Mungbean_pacbio_7_joinmap.result > Mungbean_pacbio_7_joinmap.for.allmaps | ||
+ | python -m jcvi.assembly.allmaps merge Mungbean_pacbio_5_joinmap.for.allmaps Mungbean_pacbio_7_joinmap.for.allmaps -o JM-2.bed | ||
+ | python -m jcvi.assembly.allmaps path JM-2.bed falcon_500_sspace.final.scaffolds.fasta.header.fasta | ||
+ | |||
+ | == 3/2 == | ||
+ | === Mungbean pacbio assembly === | ||
+ | MUMmer install, for dot plot between pacbio and previous ref | ||
+ | wget https://downloads.sourceforge.net/project/mummer/mummer/3.23/MUMmer3.23.tar.gz | ||
+ | tar -xvf MUMmer3.23.tar.gz | ||
+ | cd MUMmer3.23 | ||
+ | make check | ||
+ | make install | ||
+ | MUMmer3.23/mummer -mum -b -c Vradi.ver6.cor.fa.chr.fa JM-2.chr.fasta > ref_qry.mums | ||
+ | |||
+ | == 3/3 == | ||
+ | === Mungbean pacbio assembly === | ||
+ | lastz install (244:/kev8305/skyts0401/program/) | ||
+ | download from http://www.bx.psu.edu/~rsharris/lastz/ | ||
+ | tar -xvzf lastz-1.02.00.tar.gz | ||
+ | cd lastz-distrib-1.02.00/src/ | ||
+ | ---------------------------------- | ||
+ | problem with Makefile, so delete -Werror in line 31 of Makefile, save. | ||
+ | ---------------------------------- | ||
+ | make | ||
+ | make install | ||
+ | add path /home/skyts0401/lastz-distrib/bin in .profile | ||
+ | |||
+ | |||
+ | lastz (244:/kev8305/SK3/anchoring/) | ||
+ | lastz JM-2.chr.fasta[multiple] Vradi.ver6.cor.fa --notransition --step=20 --gfextend --chain --gapped --format=sam > old_new.sam | ||
+ | |||
+ | == 3/7 == | ||
+ | === Mungbean pacbio assembly === | ||
+ | MUMmer, having a problem with memory, was re-installed with a memory configuration | ||
+ | make clean | ||
+ | make CPPFLAGS="-O3 -DSIXTYFOURBITS" | ||
+ | make install | ||
+ | |||
+ | |||
+ | and use nucmer to align pacbio assembly and previous reference | ||
+ | MUMmer3.23/nucmer -maxmatch -c 100 -p ref_qry JM-2.chr.fasta Vradi.ver6.cor.fa | ||
+ | MUMmer3.23/nucmer --noextend -c 100 -p ref_qry_noextend JM-2.chr.fasta Vradi.ver6.cor.fa | ||
+ | |||
+ | |||
+ | and draw a dot plot using mummerplot | ||
+ | mummerplot --fat -l -png ref_qry_noextend.delta | ||
+ | but it produces an error like | ||
+ | set mouse clipboardformat "[%.0f, %.0f]" | ||
+ | ^ | ||
+ | "out.gp", line 2594: wrong option | ||
+ | It seems gnuplot was updated and no longer supports that option, which mummerplot writes into out.gp. Just edit out.gp to delete that line. | ||
+ | |||
+ | /kev8305/skyts0401/program/last-842/scripts/last-dotplot -2 'Vr*' -2 'scaffold_?' -x 1920 -y 1920 ref_qry.maf plot.png | ||
+ | |||
+ | == 3/16 ~ == | ||
+ | === Mungbean pacbio assembly === | ||
+ | compare between pacbio assembly and previous reference | ||
+ | |||
+ | 1. Map 50 reseq markers per LG from the previous reference onto the pacbio super scaffolds, to check that markers from the same LG land on the same chromosome. (244:/kev8305/SK3/anchoring/check) | ||
+ | python SNP_marker_pos.py Vradi_ver6.fa Mungbean_chr_coseg_parse_seg_dist.loc > Vradi.ver6.reseq.marker.fasta | ||
+ | makeblastdb -in JM-2.chr.fasta -dbtype 'nucl' -out Mungbean_pacbio | ||
+ | blastn -db Mungbean_pacbio -query Vradi.ver6.reseq.marker.fasta -outfmt 6 -out reseq_marker.blast -num_threads 2 -evalue 1e-5 -word_size 100 | ||
+ | python blastparse.py reseq_marker.blast > reseq_marker_for_svg.result | ||
+ | python chr_compare_svg.py fasta.size reseq_marker_for_svg.result > chr_compare_3.svg (output can be changed based on option in python code) | ||
+ | |||
+ | /data2/skyts0401/program/circos-0.69-4/bin/circos -conf chr_compare.conf (193:/data2/skyts0401/check/circos) | ||
+ | |||
+ | 2. contig compare. (63:/data/skyts0401/Mungbean/assembly/) | ||
+ | scp assembly@147.46.250.181:/home/assembly/data/Mungbean/mapping/p_ctg.longest.fa . | ||
+ | scp skyts0401@147.46.250.244:/kev8305/SK3/anchoring/final.contigs.longest100.fa . | ||
+ | gmap_build -d pacbio_contig_new p_ctg.longest.fa -D ./ | ||
+ | gmap -d pacbio_contig_new -D pacbio_contig_new/ final.contigs.longest100.fa -t 12 -f 1 > pacbio_contig_compare.psl | ||
+ | -------------------------------------------------------------------------------------- | ||
+ | (NICEM:/home/assembly/check/) | ||
+ | ../bwa-0.7.15/bwa mem -t 30 p_ctg.longest.longest3.fa SunhwaN_1.fastq.gz SunhwaN_2.fastq.gz > newcontig_illumina.sam | ||
+ | |||
+ | (244:/kev8305/SK3/anchoring/check/) | ||
+ | ln -s /NGS/NGS/VignaRadiata/DNA/Sunhwa_pacbio/filtered_subreads.fasta . | ||
+ | bwa index p_ctg.longest.longest1.fa | ||
+ | bwa mem -t 8 p_ctg.longest.longest1.fa filtered_subreads.fasta > newcontig_pacbio.sam | ||
+ | |||
+ | samtools view -Sb newcontig_pacbio.sam > newcontig_pacbio.bam | ||
+ | samtools sort newcontig_pacbio.bam -o newcontig_pacbio.sorted.bam | ||
+ | samtools index newcontig_pacbio.sorted.bam | ||
+ | ~ same samtools command with newcontig_illumina.sam ~ | ||
+ | |||
+ | Found that some reads looked like split mappings, so re-aligned with the end-to-end mode of bowtie2 | ||
+ | (NICEM:~/check/) | ||
+ | ~/bowtie2-2.2.9/bowtie2-build p_ctg.longest.longest1.fa p_ctg.longest.longest1.fa | ||
+ | ~/bowtie2-2.2.9/bowtie2 -x p_ctg.longest.longest1.fa -1 SunhwaN_1.fastq.gz -2 SunhwaN_2.fastq.gz --end-to-end --very-fast -p 30 -S newcontig_illumina_endtoend.sam | ||
+ | ~/bowtie2-2.2.9/bowtie2 -x p_ctg.longest.longest1.fa -f filtered_subreads.fasta --end-to-end --very-fast -p 30 -S newcontig_pacbio_endtoend.sam | ||
+ | |||
+ | !!!bowtie2-2.3.0 version has a bug!!! | ||
+ | |||
+ | (244:/kev8305/SK3/anchoring/check/) | ||
+ | scp assembly@147.46.250.181:/home/assembly/check/newcontig_pacbio_endtoend.sam . | ||
+ | scp assembly@147.46.250.181:/home/assembly/check/newcontig_illumina_endtoend.sam . | ||
+ | ~ same samtools command, view, sort, index ~ | ||
+ | samtools depth -a -q 0 -Q 0 -r 000000F:2000000-4000000 newcontig_illumina_endtoend.sorted.bam > newcontig_illumina_endtoend.mapping.depth | ||
+ | samtools depth -a -q 0 -Q 0 -r 000000F:2000000-4000000 newcontig_pacbio_endtoend.sorted.bam > newcontig_pacbio_endtoend.mapping.depth | ||
+ | |||
+ | blat for comparing contig (NICEM:/home/assembly/check/, 244:/kev8305/SK3/anchoring/check/) | ||
+ | ------------------------------ | ||
+ | (contig_compare.sh) | ||
+ | #!/bin/bash | ||
+ | |||
+ | for i in {0..19}; do | ||
+ | ../blat p_ctg.longest.longest3.fa final.contigs_devide${i}.fa contig_compare_all_${i}.psl & | ||
+ | done | ||
+ | |||
+ | wait | ||
+ | ------------------------------ | ||
+ | |||
+ | (NICEM) | ||
+ | python fasta_devide.py final.contigs.reformed.fasta | ||
+ | chmod a+x contig_compare.sh | ||
+ | ./contig_compare.sh | ||
+ | ls contig_compare_all_*.psl > psl.list | ||
+ | nano pslfilter.py | ||
+ | python pslfilter.py psl.list > contig_compare_all.result | ||
+ | python pslfilter2.py contig_compare_all.result > contig_compare_all_filtered.result | ||
+ | |||
+ | == 4/18 == | ||
+ | === Jatropha assembly === | ||
+ | make Jatropha figure(chr - lg) for new version(allmaps) (244:/kev8305/skyts0401/Jatropha) | ||
+ | scp skyts0401@147.46.250.63:/home/skyts0401/svg/make_chr_lg_svg.py make_chr_lg_svg_revised_for_allmaps.py | ||
+ | python make_chr_lg_svg_revised_for_allmaps.py Jatropha_map1.result Jatropha.allmaps.agp > Jatropha_chr_lg.svg | ||
+ | |||
+ | == 4/26 == | ||
+ | === Mungbean pacbio assembly === | ||
+ | mungbean super scaffold (JM-2.fasta) was gap filled. Final assembly Fasta is in /kev8305/SK3/anchoring/gapfilled_assembly_final/ | ||
+ | |||
+ | == 5/1 == | ||
+ | === Mungbean pacbio assembly === | ||
+ | Repeat masking progress is based on these sites: http://weatherby.genetics.utah.edu/MAKER/wiki/index.php/Repeat_Library_Construction-Advanced, http://www.repeatmasker.org/ | ||
+ | |||
+ | |||
+ | <big>'''Repeat masking program installation'''</big> | ||
+ | |||
+ | |||
+ | Repbase - for RepeatMasker | ||
+ | (63:/data/skyts0401/program/) | ||
+ | should register http://www.girinst.org/ | ||
+ | download RepBaseRepeatMaskerEdition | ||
+ | cp RepBaseRepeatMaskerEdition-20170127\ \(1\).tar.gz RepeatMasker/. | ||
+ | cd RepeatMasker/ | ||
+ | tar -xvzf RepBaseRepeatMaskerEdition-20170127\ \(1\).tar.gz | ||
+ | (the Libraries/ directory will be created and all files will be copied to RepeatMasker/Libraries/) | ||
+ | |||
+ | rmblast - for RepeatMasker (ver 2.6.0 has problem with install, so I installed v. 2.2.28) | ||
+ | (63:/data/skyts0401/program/) | ||
+ | download from http://www.repeatmasker.org/RMBlast.html | ||
+ | wget ftp://ftp.ncbi.nlm.nih.gov/blast/executables/rmblast/2.2.28/ncbi-rmblastn-2.2.28-x64-linux.tar.gz | ||
+ | wget ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/2.2.28/ncbi-blast-2.2.28+-x64-linux.tar.gz | ||
+ | tar zxvf ncbi-blast-2.2.28+-x64-linux.tar.gz | ||
+ | tar zxvf ncbi-rmblastn-2.2.28-x64-linux.tar.gz | ||
+ | cp -R ncbi-rmblastn-2.2.28/* ncbi-blast-2.2.28+/ | ||
+ | rm -rf ncbi-rmblastn-2.2.28 | ||
+ | mv ncbi-blast-2.2.28+ rmblast-2.2.28 | ||
+ | |||
+ | trf - for RepeatMasker | ||
+ | (63:/data/skyts0401/program/) | ||
+ | download from http://tandem.bu.edu/trf/trf.html | ||
+ | chmod a+x trf409.linux64 | ||
+ | ln -s trf409.linux64 RepeatMasker/trf | ||
+ | |||
+ | RepeatMasker | ||
+ | (63:/data/skyts0401/program/) | ||
+ | wget http://www.repeatmasker.org/RepeatMasker-open-4-0-7.tar.gz | ||
+ | tar -xzf RepeatMasker-open-4-0-7.tar.gz | ||
+ | cd RepeatMasker/ | ||
+ | (move the Repbase library to RepeatMasker/Libraries/) | ||
+ | perl ./configure | ||
+ | configure directory of trf, rmblast | ||
+ | |||
+ | muscle - for MITE-Hunter | ||
+ | (63:/data/skyts0401/program/) | ||
+ | check version on http://www.drive5.com/muscle/ | ||
+ | wget http://www.drive5.com/muscle/downloads3.8.31/muscle3.8.31_i86linux64.tar.gz | ||
+ | tar -xvzf muscle3.8.31_i86linux64.tar.gz | ||
+ | mkdir muscle | ||
+ | mv muscle3.8.31_i86linux64 muscle/ | ||
+ | |||
+ | mdust - for MITE-Hunter | ||
+ | (63:/data/skyts0401/program/) | ||
+ | wget ftp://occams.dfci.harvard.edu/pub/bio/tgi/software//seqclean/mdust.tar.gz | ||
+ | tar -xvzf mdust.tar.gz | ||
+ | |||
+ | MITE-Hunter | ||
+ | (63:/data/skyts0401/program/) | ||
+ | check version on http://target.iplantcollaborative.org/mite_hunter.html | ||
+ | wget http://target.iplantcollaborative.org/mite_hunter/MITE%20Hunter-11-2011.zip | ||
+ | unzip MITE\ Hunter-11-2011.zip | ||
+ | mv MITE\ Hunter/ MITE_Hunter | ||
+ | cd MITE_Hunter/ | ||
+ | perl MITE_Hunter_Installer.pl -d /data/skyts0401/program/MITE_Hunter -f formatdb -b blastall -m /data/skyts0401/program/mdust -M /data/skyts0401/program/muscle | ||
+ | |||
+ | GenomeTools | ||
+ | (63:/data/skyts0401/program/) | ||
+ | check version on http://genometools.org/ | ||
+ | wget http://genometools.org/pub/genometools-1.5.9.tar.gz | ||
+ | tar -xvzf genometools-1.5.9.tar.gz | ||
+ | cd genometools-1.5.9/ | ||
+ | make | ||
+ | sudo make install | ||
+ | - if have a problem with dependency, please check this - | ||
+ | sudo apt-get install libcairo2-dev | ||
+ | sudo apt-get install libpango1.0-dev | ||
+ | |||
+ | Genome tRNA database | ||
+ | (63:/home/skyts0401/bin/) | ||
+ | check version on http://gtrnadb.ucsc.edu | ||
+ | wget http://gtrnadb2009.ucsc.edu/download/tRNAs/eukaryotic-tRNAs.fa.gz | ||
+ | gunzip eukaryotic-tRNAs.fa.gz | ||
+ | |||
+ | CRL scripts | ||
+ | (63:/home/skyts0401/bin/) | ||
+ | wget http://www.hrt.msu.edu/uploads/535/78637/CRL_Scripts1.0.tar.gz | ||
+ | tar -xvzf CRL_Scripts1.0.tar.gz | ||
+ | |||
+ | transposons protein database | ||
+ | (63:/home/skyts0401/bin/) | ||
+ | wget http://www.hrt.msu.edu/uploads/535/78637/Tpases020812DNA.gz | ||
+ | gunzip Tpases020812DNA.gz | ||
+ | wget http://www.hrt.msu.edu/uploads/535/78637/Tpases020812.gz | ||
+ | gunzip Tpases020812.gz | ||
+ | |||
+ | plant protein database | ||
+ | (63:/home/skyts0401/bin/) | ||
+ | wget http://www.hrt.msu.edu/uploads/535/78637/alluniRefprexp070416.gz | ||
+ | gunzip alluniRefprexp070416.gz | ||
+ | |||
+ | RECON - for RepeatModeler | ||
+ | (63:/data/skyts0401/program/) | ||
+ | wget http://www.repeatmasker.org/RepeatModeler/RECON-1.08.tar.gz | ||
+ | tar -xvzf RECON-1.08.tar.gz | ||
+ | cd RECON-1.08/src/ | ||
+ | make | ||
+ | make install | ||
+ | cd ../scripts/ | ||
+ | nano recon.pl (added /data/skyts0401/program/RECON-1.08/bin to PATH = "" (third line)) | ||
+ | |||
+ | RepeatScout - for RepeatModeler | ||
+ | (63:/data/skyts0401/program/) | ||
+ | wget http://www.repeatmasker.org/RepeatScout-1.0.5.tar.gz | ||
+ | tar -xvzf RepeatScout-1.0.5.tar.gz | ||
+ | cd RepeatScout-1/ | ||
+ | make | ||
+ | sudo make install | ||
+ | |||
+ | nseg - for RepeatModeler | ||
+ | (63:/data/skyts0401/program/) | ||
+ | mkdir nseg | ||
+ | cd nseg | ||
+ | wget ftp://ftp.ncbi.nih.gov/pub/seg/nseg/* . | ||
+ | make | ||
+ | |||
+ | RepeatModeler | ||
+ | (63:/data/skyts0401/program/) | ||
+ | wget http://www.repeatmasker.org/RepeatModeler/RepeatModeler-open-1.0.9.tar.gz | ||
+ | tar -xvzf RepeatModeler-open-1.0.9.tar.gz | ||
+ | cd RepeatModeler-open-1.0.9/ | ||
+ | perl ./configure | ||
+ | configure directory of RECON, RepeatScout, nseg, trf, rmblast | ||
+ | |||
+ | hmmer - for ProtExcluder | ||
+ | (63:/data/skyts0401/program/) | ||
+ | wget http://eddylab.org/software/hmmer3/3.1b2/hmmer-3.1b2-linux-intel-x86_64.tar.gz | ||
+ | tar -xvzf hmmer-3.1b2-linux-intel-x86_64.tar.gz | ||
+ | cd hmmer-3.1b2-linux-intel-x86_64/ | ||
+ | ./configure | ||
+ | make | ||
+ | sudo make install | ||
+ | |||
+ | ProtExcluder | ||
+ | wget http://www.hrt.msu.edu/uploads/535/78637/ProtExcluder1.2.tar.gz | ||
+ | tar -xvzf ProtExcluder1.2.tar.gz | ||
+ | cd ProtExcluder1.2/ | ||
+ | ./Installer.pl -m /data/skyts0401/program/hmmer-3.1b2-linux-intel-x86_64/binaries/ -p /data/skyts0401/program/ProtExcluder1.2/ | ||
+ | |||
+ | |||
+ | <big>'''Repeat masking progress'''</big> | ||
+ | |||
+ | Basic command which I used is based on http://weatherby.genetics.utah.edu/MAKER/wiki/index.php/Repeat_Library_Construction-Advanced | ||
+ | |||
+ | |||
+ | Move Mungbean genome assembly final version | ||
+ | scp skyts0401@147.46.250.244:/kev8305/SK3/anchoring/gapfilled_assembly_final/standard_output.gapfilled.final.fa . | ||
+ | |||
+ | MITE library | ||
+ | (63:/data/skyts0401/program/MITE_Hunter/) | ||
+ | perl MITE_Hunter_manager.pl -i /data/skyts0401/Mungbean/assembly/standard_output.gapfilled.final.fa -g Mungbean -c 10 -S 12345678 | ||
+ | mv Mungbean* /data/skyts0401/Mungbean/repeatmask/MITE/. | ||
+ | cd /data/skyts0401/Mungbean/repeatmask/MITE/ | ||
+ | cat Mungbean_Step8_*.fa > ../MITE.lib | ||
+ | |||
+ | LTR library | ||
+ | (63:/data/skyts0401/Mungbean/repeatmask/LTR/99/) | ||
+ | ln -s /data/skyts0401/Mungbean/assembly/standard_output.gapfilled.final.fa . | ||
+ | gt suffixerator -db standard_output.gapfilled.final.fa -indexname Mungbean_LTR -tis -suf -lcp -des -ssp -dna | ||
+ | gt ltrharvest -index Mungbean_LTR -out Mungbean.out99 -outinner Mungbean.outinner99 -gff3 Mungbean.gff99 -minlenltr 100 -maxlenltr 6000 -mindistltr 1500 -maxdistltr 25000 -mintsd 5 -maxtsd 5 -motif tgca -similar 99 -vic 10 > Mungbean.result99 | ||
+ | gt gff3 -sort Mungbean.gff99 > Mungbean.gff99.sort | ||
+ | gt ltrdigest -trnas ~/bin/eukaryotic-tRNAs.fa Mungbean.gff99.sort Mungbean_LTR > Mungbean.gff99.dgt | ||
+ | perl ~/bin/CRL_Scripts1.0/CRL_Step1.pl --gff Mungbean.gff99.dgt | ||
+ | perl ~/bin/CRL_Scripts1.0/CRL_Step2.pl --step1 CRL_Step1_Passed_Elements.txt --repeatfile Mungbean.out99 --resultfile Mungbean.result99 --sequencefile standard_output.gapfilled.final.fa --removed_repeats CRL_Step2_Passed_Elements.fasta | ||
+ | mkdir fasta_files | ||
+ | mv Repeat_*.fasta fasta_files/\ | ||
+ | mv Repeat_*.fasta fasta_files/ | ||
+ | mv CRL_Step2_Passed_Elements.fasta fasta_files/ | ||
+ | cd fasta_files/ | ||
+ | perl ~/bin/CRL_Scripts1.0/CRL_Step3.pl --directory . --step2 CRL_Step2_Passed_Elements.fasta --pidentity 60 --seq_c 25 | ||
+ | mv CRL_Step3_Passed_Elements.fasta .. | ||
+ | cd .. | ||
+ | perl ~/bin/CRL_Scripts1.0/ltr_library.pl --resultfile Mungbean.result99 --step3 CRL_Step3_Passed_Elements.fasta --sequencefile standard_output.gapfilled.final.fa | ||
+ | cp lLTR_Only.lib ../lLTR_Only_99.lib | ||
+ | cat lLTR_Only.lib ../../MITE/MITE.lib > repeats_to_mask_LTR99.fasta | ||
+ | /data/skyts0401/program/RepeatMasker/RepeatMasker -lib repeats_to_mask_LTR99.fasta -nolow -dir . Mungbean.outinner99 | ||
+ | perl ~/bin/CRL_Scripts1.0/cleanRM.pl Mungbean.outinner99.out Mungbean.outinner99.masked > Mungbean.outinner99.unmasked | ||
+ | perl ~/bin/CRL_Scripts1.0/rmshortinner.pl Mungbean.outinner99.unmasked 50 > Mungbean.outinner99.clean | ||
+ | makeblastdb -in ~/bin/Tpases020812DNA -dbtype prot | ||
+ | blastx -query Mungbean.outinner99.clean -db ~/bin/Tpases020812DNA -evalue 1e-10 -num_descriptions 10 -out Mungbean.outinner99.clean_blastx.out.txt | ||
+ | perl ~/bin/CRL_Scripts1.0/outinner_blastx_parse.pl --blastx Mungbean.outinner99.clean_blastx.out.txt --outinner Mungbean.outinner99 | ||
+ | perl ~/bin/CRL_Scripts1.0/CRL_Step4.pl --step3 CRL_Step3_Passed_Elements.fasta --resultfile Mungbean.result99 --innerfile passed_outinner_sequence.fasta --sequencefile standard_output.gapfilled.final.fa | ||
+ | makeblastdb -in lLTRs_Seq_For_BLAST.fasta -dbtype nucl | ||
+ | blastn -query lLTRs_Seq_For_BLAST.fasta -db lLTRs_Seq_For_BLAST.fasta -evalue 1e-10 -num_descriptions 1000 -out lLTRs_Seq_For_BLAST.fasta.out | ||
+ | makeblastdb -in Inner_Seq_For_BLAST.fasta -dbtype nucl | ||
+ | blastn -query Inner_Seq_For_BLAST.fasta -db Inner_Seq_For_BLAST.fasta -evalue 1e-10 -num_descriptions 1000 -out Inner_Seq_For_BLAST.fasta.out | ||
+ | perl ~/bin/CRL_Scripts1.0/CRL_Step5.pl --LTR_blast lLTRs_Seq_For_BLAST.fasta.out --inner_blast Inner_Seq_For_BLAST.fasta.out --step3 CRL_Step3_Passed_Elements.fasta --final LTR99.lib --pcoverage 90 --pidentity 80 | ||
+ | |||
+ | relatively old LTR (Same command with above one, but for relatively old LTR) | ||
+ | (63:/data/skyts0401/Mungbean/repeatmask/LTR/85) | ||
+ | (to avoid confusing the LTR_99 results with these results, make directories 99 and 85 in the LTR directory) | ||
+ | ln -s /data/skyts0401/Mungbean/assembly/standard_output.gapfilled.final.fa . | ||
+ | gt suffixerator -db standard_output.gapfilled.final.fa -indexname Mungbean_LTR -tis -suf -lcp -des -ssp -dna | ||
+ | gt ltrharvest -index Mungbean_LTR -out Mungbean.out85 -outinner Mungbean.outinner85 -gff3 Mungbean.gff85 -minlenltr 100 -maxlenltr 6000 -mindistltr 1500 -maxdistltr 25000 -mintsd 5 -maxtsd 5 -vic 10 > Mungbean.result85 | ||
+ | cp ../99/CRL_Step1_Passed_Elements.txt . | ||
+ | perl ~/bin/CRL_Scripts1.0/CRL_Step2.pl --step1 CRL_Step1_Passed_Elements.txt --repeatfile Mungbean.out85 --resultfile Mungbean.result85 --sequencefile standard_output.gapfilled.final.fa --removed_repeats CRL_Step2_Passed_Elements.fasta | ||
+ | mkdir fasta_files | ||
+ | mv Repeat_*.fasta fasta_files/ | ||
+ | mv CRL_Step2_Passed_Elements.fasta fasta_files/ | ||
+ | cd fasta_files/ | ||
+ | perl ~/bin/CRL_Scripts1.0/CRL_Step3.pl --directory . --step2 CRL_Step2_Passed_Elements.fasta --pidentity 60 --seq_c 25 | ||
+ | mv CRL_Step3_Passed_Elements.fasta .. | ||
+ | cd .. | ||
+ | perl ~/bin/CRL_Scripts1.0/ltr_library.pl --resultfile Mungbean.result85 --step3 CRL_Step3_Passed_Elements.fasta --sequencefile standard_output.gapfilled.final.fa | ||
+ | cp lLTR_Only.lib ../lLTR_Only_85.lib | ||
+ | cat lLTR_Only.lib ../../MITE/MITE.lib > repeats_to_mask_LTR85.fasta | ||
+ | /data/skyts0401/program/RepeatMasker/RepeatMasker -lib repeats_to_mask_LTR85.fasta -nolow -dir . Mungbean.outinner85 | ||
+ | perl ~/bin/CRL_Scripts1.0/cleanRM.pl Mungbean.outinner85.out Mungbean.outinner85.masked > Mungbean.outinner85.unmasked | ||
+ | perl ~/bin/CRL_Scripts1.0/rmshortinner.pl Mungbean.outinner85.unmasked 50 > Mungbean.outinner85.clean | ||
+ | makeblastdb -in ~/bin/Tpases020812DNA -dbtype prot | ||
+ | blastx -query Mungbean.outinner85.clean -db ~/bin/Tpases020812DNA -evalue 1e-10 -num_descriptions 10 -out Mungbean.outinner85.clean_blastx.out.txt | ||
+ | perl ~/bin/CRL_Scripts1.0/outinner_blastx_parse.pl --blastx Mungbean.outinner85.clean_blastx.out.txt --outinner Mungbean.outinner85 | ||
+ | perl ~/bin/CRL_Scripts1.0/CRL_Step4.pl --step3 CRL_Step3_Passed_Elements.fasta --resultfile Mungbean.result85 --innerfile passed_outinner_sequence.fasta --sequencefile standard_output.gapfilled.final.fa | ||
+ | makeblastdb -in lLTRs_Seq_For_BLAST.fasta -dbtype nucl | ||
+ | blastn -query lLTRs_Seq_For_BLAST.fasta -db lLTRs_Seq_For_BLAST.fasta -evalue 1e-10 -num_descriptions 1000 -out lLTRs_Seq_For_BLAST.fasta.out | ||
+ | makeblastdb -in Inner_Seq_For_BLAST.fasta -dbtype nucl | ||
+ | blastn -query Inner_Seq_For_BLAST.fasta -db Inner_Seq_For_BLAST.fasta -evalue 1e-10 -num_descriptions 1000 -out Inner_Seq_For_BLAST.fasta.out | ||
+ | perl ~/bin/CRL_Scripts1.0/CRL_Step5.pl --LTR_blast lLTRs_Seq_For_BLAST.fasta.out --inner_blast Inner_Seq_For_BLAST.fasta.out --step3 CRL_Step3_Passed_Elements.fasta --final LTR85.lib --pcoverage 90 --pidentity 80 | ||
+ | /data/skyts0401/program/RepeatMasker/RepeatMasker -lib ../99/LTR99.lib -dir . LTR85.lib | ||
+ | perl ~/bin/CRL_Scripts1.0/remove_masked_sequence.pl --masked_elements LTR85.lib.masked --outfile FinalLTR85.lib | ||
+ | cd .. | ||
+ | cat 99/LTR99.lib 85/FinalLTR85.lib > allLTR.lib | ||
+ | |||
+ | Collecting repetitive sequences | ||
+ | ln -s /data/skyts0401/Mungbean/assembly/standard_output.gapfilled.final.fa . | ||
+ | nano fasta_devide.py | ||
+ | python fasta_devide.py standard_output.gapfilled.final.fa | ||
+ | nano repeatmask_combine.sh | ||
+ | chmod a+x repeatmask_combine.sh | ||
+ | ./repeatmask_combine.sh | ||
+ | cat standard_output.gapfilled.final_devide*.fa.masked > standard_output.gapfilled.final.fa.masked | ||
+ | perl ~/bin/CRL_Scripts1.0/rmaskedpart.pl standard_output.gapfilled.final.fa.masked 50 > umseqfile | ||
+ | /data/skyts0401/program/RepeatModeler-open-1.0.9/BuildDatabase -name umseqfiledb -engine ncbi umseqfile | ||
+ | nohup /data/skyts0401/program/RepeatModeler-open-1.0.9/RepeatModeler -database umseqfiledb >& umseqfile.out | ||
+ | perl ~/bin/CRL_Scripts1.0/repeatmodeler_parse.pl --fastafile consensi.fa.classified --unknowns repeatmodeler_unknowns.fasta --identities repeatmodeler_identities.fasta | ||
+ | makeblastdb -in ~/bin/Tpases020812 -dbtype prot | ||
+ | blastx -query repeatmodeler_unknowns.fasta -db ~/bin/Tpases020812 -evalue 1e-10 -num_descriptions 10 -out modelerunknown_blast_result.txt | ||
+ | ~/bin/CRL_Scripts1.0/transposon_blast_parse.pl --blastx modelerunknown_blast_result.txt --modelerunknown repeatmodeler_unknowns.fasta | ||
+ | mv unknown_elements.txt ModelerUnknown.lib | ||
+ | cat identified_elements.txt repeatmodeler_identities.fasta > ModelerID.lib | ||
+ | |||
+ | Exclusion of gene fragments | ||
+ | makeblastdb -in ~/bin/alluniRefprexp070416 -dbtype prot | ||
+ | blastx -query ModelerUnknown.lib -db ~/bin/alluniRefprexp070416 -evalue 1e-10 -num_descriptions 10 -out ModelerUnknown.lib_blast_result.txt | ||
+ | cd LTR/ | ||
+ | python headerforamt.py allLTR.lib > allLTR.lib.reformed (LTR library has '(' symbol, resulting in ProtExcluder error, so change the format) | ||
+ | cd .. | ||
+ | mkdir ProtExclude | ||
+ | cd ProtExclude/ | ||
+ | cp ../MITE/MITE.lib . | ||
+ | cp ../LTR/allLTR.lib.reformed . | ||
+ | cp ../ModelerID.lib . | ||
+ | cp ../ModelerUnknown.lib . | ||
+ | cat allLTR.lib.reformed MITE.lib ModelerID.lib > KnownRepeats.lib | ||
+ | cat KnownRepeats.lib ModelerUnknown.lib > allRepeats.lib | ||
+ | blastx -query allRepeats.lib -db ~/bin/alluniRefprexp070416 -evalue 1e-10 -num_descriptions 10 -out allRepeats.lib_blast_results.txt | ||
+ | /data/skyts0401/program/ProtExcluder1.2/ProtExcluder.pl allRepeats.lib_blast_results.txt allRepeats.lib | ||
+ | |||
+ | == 5/26 == | ||
+ | === Mungbean pacbio assembly === | ||
+ | For assessment of assembly, run CEGMA and BUSCO | ||
+ | |||
+ | |||
+ | <big>'''Install'''</big> | ||
+ | |||
+ | CEGMA | ||
+ | (63:/data/skyts0401/program/) | ||
+ | sudo apt-get install wise (dependency) | ||
+ | wget ftp://genome.crg.es/pub/software/geneid/geneid_v1.4.4.Jan_13_2011.tar.gz (dependency) | ||
+ | tar -xvzf geneid_v1.4.4.Jan_13_2011.tar.gz | ||
+ | cd geneid | ||
+ | make | ||
+ | make install | ||
+ | nano ~/.profile (add $PATH:/data/skyts0401/program/geneid/bin) | ||
+ | cd .. | ||
+ | git clone https://github.com/KorfLab/CEGMA_v2.git | ||
+ | cd CEGMA_v2/ | ||
+ | make | ||
+ | |||
+ | |||
+ | BUSCO | ||
+ | (63:/data/skyts0401/program/) | ||
+ | wget http://bioinf.uni-greifswald.de/augustus/binaries/augustus-3.2.3.tar.gz (dependency) | ||
+ | tar -xvzf augustus-3.2.3.tar.gz | ||
+ | cd augustus-3.2.3/ | ||
+ | make (dependency error) | ||
+ | sudo apt-get install bamtools libbamtools-dev | ||
+ | make | ||
+ | sudo make install | ||
+ | cd .. | ||
+ | git clone https://gitlab.com/ezlab/busco.git | ||
+ | cd busco | ||
+ | sudo python setup.py install | ||
+ | cp config/config.ini.default config/config.ini | ||
+ | nano config/config.ini (change the augustus path (path = /data/skyts0401/program/augustus-3.2.3/scripts/, bin/)) | ||
+ | |||
+ | |||
+ | <big>'''Running'''</big> | ||
+ | |||
+ | CEGMA | ||
+ | (63:/data/skyts0401/Mungbean/cegma/) | ||
+ | export CEGMA="/data/skyts0401/program/CEGMA_v2" | ||
+ | export PERL5LIB="$PERL5LIB:$CEGMA/lib" | ||
+ | export PERL5LIB=$CEGMA/lib:$PERL5LIB | ||
+ | source ~/.profile | ||
+ | /data/skyts0401/program/CEGMA_v2/bin/cegma --genome standard_output.gapfilled.final.fa -threads 5 | ||
+ | |||
+ | |||
+ | BUSCO | ||
+ | (63:/data/skyts0401/Mungbean/busco/) | ||
+ | wget http://busco.ezlab.org/datasets/eukaryota_odb9.tar.gz (dataset) | ||
+ | wget http://busco.ezlab.org/datasets/embryophyta_odb9.tar.gz (dataset) | ||
+ | ln -s ../assembly/standard_output.gapfilled.final.fa . | ||
+ | export AUGUSTUS_CONFIG_PATH="/data/skyts0401/program/augustus-3.2.3/config/" | ||
+ | python /data/skyts0401/program/busco/scripts/run_BUSCO.py -i standard_output.gapfilled.final.fa -o Mungbean_busco -c 20 -l eukaryota_odb9/ -m geno | ||
+ | python /data/skyts0401/program/busco/scripts/run_BUSCO.py -i standard_output.gapfilled.final.fa -o Mungbean_plant_busco -c 20 -l embryophyta_odb9/ -m geno | ||
+ | |||
+ | == 5/29 == | ||
+ | === Mungbean pacbio assembly === | ||
+ | Maker | ||
+ | |||
+ | |||
+ | <big>'''Install'''</big> | ||
+ | |||
+ | ncbi-blast+ | ||
+ | (63:/data/skyts0401/program/) | ||
+ | wget ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/2.6.0/ncbi-blast-2.6.0+-x64-linux.tar.gz | ||
+ | tar -xvzf ncbi-blast-2.6.0+-x64-linux.tar.gz | ||
+ | |||
+ | |||
+ | exonerate | ||
+ | (63:/data/skyts0401/program/) | ||
+ | git clone https://github.com/nathanweeks/exonerate.git | ||
+ | cd exonerate/ | ||
+ | git checkout v2.4.0 | ||
+ | autoreconf -i | ||
+ | ./configure | ||
+ | make | ||
+ | sudo make install | ||
+ | |||
+ | |||
+ | Maker | ||
+ | (63:/data/skyts0401/program/) | ||
+ | download from http://www.yandell-lab.org/software/maker.html | ||
+ | cd maker/ | ||
+ | cd src/ | ||
+ | nano ~/.profile (add the RepeatMasker directory to $PATH) | ||
+ | source ~/.profile | ||
+ | perl Build.PL | ||
+ | ./Build install | ||
+ | |||
+ | |||
+ | <big>'''Running'''</big> | ||
+ | |||
+ | Preparation | ||
+ | (63:/data/skyts0401/Mungbean/maker/) | ||
+ | (Add PATH(/data/skyts0401/program/maker/bin) to ~/.profile) | ||
+ | ln -s ../assembly/Vradi.pacbio.gapfilled.final.fa . | ||
+ | mkdir ../transcriptome | ||
+ | cd ../transcriptome/ | ||
+ | scp skyts0401@147.46.250.244:/data/KangYJ/Mungbean/Transcriptome/merge/mungbean_merge.fa.cdhit.fa . | ||
+ | cd ../maker/ | ||
+ | ln -s ../transcriptome/mungbean_merge.fa.cdhit.fa . | ||
+ | mkdir ref | ||
+ | cd ref/ | ||
+ | (download Fvesca annotation file from phytozome) | ||
+ | unzip Fvesca_download.zip | ||
+ | cd Fvesca/v1.1/annotation/ | ||
+ | gunzip Fvesca_226_v1.1.protein.fa.gz | ||
+ | gunzip Fvesca_226_v1.1.transcript.fa.gz | ||
+ | cd ../../.. | ||
+ | cp Fvesca/v1.1/annotation/Fvesca_226_v1.1.protein.fa . | ||
+ | cp Fvesca/v1.1/annotation/Fvesca_226_v1.1.transcript.fa . | ||
+ | scp skyts0401@147.46.250.244:/alima9002/ref/forJat/Athaliana_167_TAIR10*.fa | ||
+ | scp skyts0401@147.46.250.244:/alima9002/ref/forJat/Athaliana_167_TAIR10*.fa . | ||
+ | scp skyts0401@147.46.250.244:/alima9002/ref/forJat/Gmax*.fa . | ||
+ | scp skyts0401@147.46.250.244:/alima9002/ref/forJat/Ptrichocarpa*.fa . | ||
+ | scp skyts0401@147.46.250.244:/alima9002/ref/forJat/Vvinifera*.fa . | ||
+ | scp skyts0401@147.46.250.244:/alima9002/ref/forJat/Osativa*.fa . | ||
+ | cd .. | ||
+ | ln -s ../repeatmask/ProtExclude/allRepeats.libnoProtFinal | ||
+ | mkdir tmp | ||
+ | |||
+ | |||
+ | Running | ||
+ | (63:/data/skyts0401/Mungbean/maker/) | ||
+ | maker -CTL | ||
+ | nano maker_bopts.ctl (default, check blast_type=ncbi+) | ||
+ | nano maker_exe.ctl (change the path ncbi-blast+, RepeatMasker, exonerate, augustus) | ||
+ | nano maker_opts.ctl (change the path genome, evidence(transcriptome, protein), repeat library, temporary directory) | ||
+ | mpiexec -n 30 maker -fix_nucleotides maker_opts.ctl maker_bopts.ctl maker_exe.ctl >& maker_opts.ctl.log | ||
+ | |||
+ | == 6/26 == | ||
+ | === Mungbean pacbio assembly === | ||
+ | checking synteny block for chromosome split, combine | ||
+ | |||
+ | |||
+ | blast | ||
+ | (NICEM:~/data/Mungbean/blast) | ||
+ | makeblastdb -in Vradi.ver6.cor.pep.fa -dbtype 'prot' | ||
+ | blastall -i adzuki.ver3.pep.fa.tr.cor.fa -d Vradi.ver6.cor.pep.fa -p blastp -e 1e-10 -b 5 -v 5 -m 8 -o mcscanx/old_Va.blast | ||
+ | # same procedure for other organism protein | ||
+ | |||
+ | |||
+ | MCScanX | ||
+ | (NICEM:~/data/Mungbean/blast/mcscanx) | ||
+ | python gffcombine.py Vradi_ver6.gff.sorted.by.TY.gff adzuki.ver3.gene.gff.cor.gff > old_Va.gff | ||
+ | ~/data/program/MCScanX/MCScanX old_Va | ||
+ | # same procedure for other organism protein, just change the species name in gffcombine.py and command | ||
+ | |||
+ | |||
+ | Circos | ||
+ | (193:/data2/skyts0401/Mungbean/synteny/circos/) | ||
+ | /data2/skyts0401/program/circos-0.69-4/bin/circos -conf synteny_Va_gene.conf -outputfile synteny_Va_gene.png | ||
+ | # same procedure for other organism, change configuration file |
Latest revision as of 06:28, 4 July 2017
Contents |
1 / 9
Minyoung_UV_QTL
Parsed genotype data (JoinMap) and phenotype data into IciMapping format (bip format) using lgcombine.py (63:/data/skyts0401/Mungbean/MY_UV/). Found that the linkage group 2 map is wrong, so the map has to be constructed again.
Mungbean synchronous QTL
Made loc file (244:/home/skyts0401/reseq/chr/Mungbean_chr_coseq_parse_seg_dist.loc); markers with missing > 10%, hetero > 10, or depth < 3 were filtered out. While grouping them, found that vr03 and vr04 are combined into one group and vr05 is split into 2 groups; check it.
Mungbean pacbio assembly
moving SAM data(align reseq data on pacbio-scaffold) from NICEM server to 244 server (244:/kev8305/SK3/)
1 / 10
Minyoung_UV_QTL
QTL analysis by using IciMapping
Mungbean synchronous QTL
construct genetic map (JoinMap 4.1), just using the linkage groups with chr 3 and 4 combined and chr 5 split.
ML method, Haldane algorithm
Mungbean pacbio assembly
convert SAM format to BAM format (244:/kev8305/SK3/)
./convertbam.sh
1/ 11
Mungbean synchronous QTL
QTL analysis by using RQTL(desktop:/Users/sky/desktop/Mungbean_syn_RQTL.csv) just for checking locus
1/16
Mungbean pacbio assembly
copying sorted.bam files from 244 server to 63 server
variant calling (244:/kev8305/SK3/, 63:/data/skyts0401/Mungbean/mapping/resequencing/)
samtools mpileup -f falcon_500_sspace.final.scaffolds.fasta -v -t DP,AD,ADF,ADR,SP,INFO/AD,INFO/ADF,INFO/ADR -b bam_list | bcftools call -v -m -O v > variants.vcf samtools mpileup -f falcon_500_sspace.final.scaffolds.fasta -I -v -t DP,AD,ADF,ADR,SP,INFO/AD,INFO/ADF,INFO/ADR -b bam_list | bcftools call -v -m -O v > variants_snp.vcf
1/18
Mungbean pacbio assembly
variant calling Kyoungki Jarae #5 with pacbio falcon scaffold (63:/data/skyts0401/Mungbean/mapping/resequencing/)
bwa index falcon_500_sspace.final.scaffolds.fasta bwa mem -t 10 falcon_500_sspace.final.scaffolds.fasta KJ-C_1.fastq.gz KJ-C_2.fastq.gz > KJ-pe_falcon_scaffold.sam
1/19
Mungbean pacbio assembly
variant calling Kyoungki Jarae #5 with pacbio falcon scaffold (63:/data/skyts0401/Mungbean/mapping/resequencing/)
samtools view -Sb KJ-pe_falcon_scaffold.sam > KJ-pe_falcon_scaffold.bam samtools sort KJ-pe_falcon_scaffold.bam -o KJ-pe_falcon_scaffold.sorted.bam samtools index KJ-pe_falcon_scaffold.sorted.bam samtools mpileup -f falcon_500_sspace.final.scaffolds.fasta -I -v -t DP,AD,ADF,ADR,SP,INFO/AD,INFO/ADF,INFO/ADR -u KJ-pe_falcon_scaffold.sorted.bam | bcftools call -v -m -O v > KJ_falcon_scaffold_variants_snp.vcf
1/24
Jatropha assembly
make svg file for superscaffold - linkage group marker location (63:/home/skyts0401/svg/)
python make_chr_lg_svg.py standard_output.final.scaffolds.fasta.tr.JM_out.fa standard_output.final.scaffolds.fasta LG.total.txt.reformed standard_output.final.scaffolds.fasta.tr.JM_out.fa.log > chr_lg.svg
1/31
Mungbean Chloroplast assembly
pairing Illumina PE read (63:/home/skyts0401/)
sudo python PE-pairing.py /data/jungminh/mungbean/PE/SunhwaN_1_cont.fq /data/jungminh/mungbean/PE/SunhwaN_2_cont.fq
2/2
Mungbean Chloroplast assembly
(63:/data/skyts0401/Mungbean/chloroplast/)
gmap_build -D gmap_db -d v.radiata v.radiata.fasta gmap --nosplicing -D gmap_db -n 1 -d v.radiata -f samse scaf_cp_20k.fasta -t 12 | samtools view -Sb > Vr-cp_scaf-cp-20k.bam samtools sort Vr-cp_scaf-cp-20k.bam -o Vr-cp_scaf-cp-20k.sorted.bam samtools index Vr-cp_scaf-cp-20k.sorted.bam
2/3 ~ 2/6
Mungbean Chloroplast assembly
falcon - path : 63:/home/skyts0401/Falcon_RE/rere/
before run, copy fc_env folder (63:/data/skyts0401/Falcon/)
cp -r ~/FALCON_RE/rere/FALCON-integrate/fc_env YOUR_FOLDER
and configure file is on /home/skyts0401/fc_run.cfg
align canu contig_cp file to canu contig_cp assembly (63:/data/skyts0401/Mungbean/chloroplast/)
~/bowtie2-2.2.9/bowtie2-build Vr_cp_canu.contigs.for.mapping.fasta Vr_cp_canu.contigs.for.mapping.fasta ~/bowtie2-2.2.9/bowtie2 -x Vr_cp_canu.contigs.for.mapping.fasta -f canu_ctg_cp.fasta --end-to-end --very-fast -p 20 -S cp-assembly_canu_ctg_revised.sam samtools view -Sb cp-assembly_canu-ctg.sam > cp-assembly_canu-ctg.bam samtools sort cp-assembly_canu-ctg.bam -o cp-assembly_canu-ctg.sorted.bam samtools index cp-assembly_canu-ctg.sorted.bam samtools faidx canu_ctg_cp.fasta .... ~/bowtie2-2.2.9/bowtie2 -x Vr_cp_canu.contigs.for.mapping.fasta -f pb.cp.fasta --end-to-end --very-fast -p 20 -S cp-assembly_pb-cp.sam .... ~/bowtie2-2.2.9/bowtie2 -x Vr_cp_canu.contigs.for.mapping.fasta -1 SunhwaN_1_cont.fq.pairing.fq -2 SunhwaN_2_cont.fq.pairing.fq --end-to-end --very-fast -p 20 -S cp-assembly_PE-cp.sam
2/6
Mungbean Chloroplast assembly
assembly (canu) mungbean pacbio corrected read for chloroplast, parameter changed (63:/data/skyts0401/Mungbean/chloroplast/)
~/canu/Linux-amd64/bin/canu -assemble -p cp_read5 -d assembly/cp_read5 genomeSize=154k contigFilter="5 1000 0.75 0.75 2" -pacbio-corrected pb.cp.fasta ~/canu/Linux-amd64/bin/canu -assemble -p cp_read10 -d assembly/cp_read10 genomeSize=154k contigFilter="10 1000 0.75 0.75 2" -pacbio-corrected pb.cp.fasta
and we have 2 contigs (one contig have LSC+IR, and other contig have SSC+IR)
just assembly them(cp_1.fa, cp_2.fa, cp_3.fa)
2/9
Mungbean Chloroplast assembly
quiver(GenomicConsensus) install(63:/data/kev8305/skyts0401/program)
--- boost (ConsensusCore dependency) --- wget https://sourceforge.net/projects/boost/files/boost/1.63.0/boost_1_63_0.tar.gz tar -xf boost_1_63_0.tar.gz cd boost_1_63_0/ ./bootstrap.sh sudo apt-get install python-dev (solution for error-pyconfig.h) sudo ./b2 install
--- swig (ConsensusCore dependency) --- wget https://downloads.sourceforge.net/project/swig/swig/swig-3.0.12/swig-3.0.12.tar.gz tar -xf swig-3.0.12.tar.gz cd swig-3.0.12/ ./configure make sudo make install
--- ConsensusCore (GenomicConsensus dependency) --- git clone https://github.com/PacificBiosciences/ConsensusCore.git cd ConsensusCore/ sudo python setup.py install
--- GenomicConsensus --- git clone https://github.com/PacificBiosciences/GenomicConsensus.git sudo apt-get install libhdf5-serial-dev (solution for error-hdf5.h) sudo make
Align PacBio_chloroplast read to vr.pb.cp.fasta(PacBio cp assembly) (63:/kev8305/Mungbean_assembly/chloroplast/)
bowtie2 -x vr.pb.cp.fasta -f pb.cp.fasta --end-to-end --very-fast -p 4 -S cp-assembly_pb-cp.sam samtools view -Sb cp-assembly_pb-cp.sam > cp-assembly_pb-cp.bam
Align Illumina Paired-End read to vr.pb.cp.fasta(PacBio cp assembly) (63:/kev8305/Mungbean_assembly/chloroplast/)
bowtie2 -x vr.pb.cp.fasta -1 SunhwaN_1_cont.fq.pairing.fq -2 SunhwaN_2_cont.fq.pairing.fq --end-to-end --very-fast -p 4 -S cp-assembly_PE-cp.sam samtools view -Sb cp-assembly_PE-cp.sam > cp-assembly_PE-cp.bam
Polishing by Quiver
2/10
Mungbean Chloroplast assembly
For Quiver, aligning the PacBio chloroplast reads to vr.pb.cp.fasta must be done with pbalign, not bowtie or another aligner.
pbalign install (63:/kev8305/skyts0401/program)
--- blasr (pbalign dependency) --- https://github.com/PacificBiosciences/blasr/blob/master/doc/INSTALL_MAKE.md
--- pbcommand (quiver dependency) --- git clone https://github.com/PacificBiosciences/pbcommand.git cd pbcommand sudo python setup.py install
--- pbalign --- git clone https://github.com/PacificBiosciences/pbalign.git cd pbalign/ sudo pip install .
pbalign (tried to align by using blasr algorithm , but sam or bam is no longer supported in blasr, so just use bowtie algorithm) (63:/kev8305/Mungbean_assembly/chloroplast/)
pbalign --noSplitSubreads --nproc 4 --algorithm bowtie pb.cp.fasta vr.pb.cp.fasta cp-assembly-pb-cp.for.quiver.sam
2/13
Mungbean Chloroplast assembly
An error occurred while running pbalign, so blasr was re-installed (probably a library error)
2/14
Mungbean Chloroplast assembly
variant calling with PE and PB read on chloroplast assembly genome (63:/kev8305/Mungbean_assembly/chloroplast/)
samtools mpileup -f vr.pb.cp.fasta -v -t DP,AD,ADF,ADR,SP,INFO/AD,INFO/ADF,INFO/ADR -u cp-assembly_pb-cp.sorted.bam | bcftools call -v -m -O v > vr.cp_pb_variants.vcf samtools mpileup -f vr.pb.cp.fasta -v -t DP,AD,ADF,ADR,SP,INFO/AD,INFO/ADF,INFO/ADR -u cp-assembly_PE-cp.sorted.bam | bcftools call -v -m -O v > vr.cp_PE_variants.vcf
2/16
Mungbean Chloroplast assembly
Align PE reads to vr.pb.cp.fasta by using bwa (244:/kev8305/Mungbean_assembly/chloroplast/)
bwa index vr.pb.cp.fasta bwa mem -t 4 vr.pb.cp.fasta SunhwaN_1_cont.fq.pairing.fq SunhwaN_2_cont.fq.pairing.fq > vr.pb.cp_PE.sam samtools view -Sb vr.pb.cp_PE.sam > vr.pb.cp_PE.bam samtools sort vr.pb.cp_PE.bam -o vr.pb.cp_PE.sorted.bam samtools index vr.pb.cp_PE.sorted.bam samtools mpileup -f vr.pb.cp.fasta -v -t DP,AD,ADF,ADR,SP,INFO/AD,INFO/ADF,INFO/ADR -u vr.pb.cp_PE.sorted.bam | bcftools call -v -m -O v > variants_PE_bwa.vcf .... bwa mem -t 4 vr.pb.cp.fasta pb.cp.fasta > vr.pb.cp_PB.sam samtools view -Sb vr.pb.cp_PB.sam > vr.pb.cp_PB.bam samtools sort vr.pb.cp_PB.bam -o vr.pb.cp_PB.sorted.bam samtools index vr.pb.cp_PB.sorted.bam .... samtools mpileup -f vr.pb.cp.fasta -v -t DP,AD,ADF,ADR,SP,INFO/AD,INFO/ADF,INFO/ADR -u vr.pb.cp_PE.sorted.bam > variants_PE_bwa_all.vcf python vcf_filtering.py variants_PE_bwa_all.vcf > variants_PE_bwa_all_0.15.vcf
2/21
Mungbean Chloroplast assembly
make a code that read fasta and annotation file(gff or gb) and make a fasta file with gene CDS sequence (63:/kev8305/Mungbean_assembly/chloroplast/)
python getCDS.py vr.pb.cp.fasta vr.pb.cp.gff > vr.pb.cp.gene.fasta python getCDS.py v.radiata.fasta v.radiata.gb > v.radiata.gene.fasta
2/22
Mungbean pacbio assembly
snp calling done, snp filtering for genetic map construction (244:/kev8305/SK3/)
python ~/reseq/vcfparse_parent.py variants_snp.vcf KJ_falcon_scaffold_variants_snp.vcf python ~/reseq/vcfparse.py variants_snp_compare_parents.vcf (dp >= 5, missing < 13, hetero < 10) python ~/reseq/vcfparse_coseg.py variants_snp_compare_parents_filtered.vcf Mungbean_pacbio_scaffold_3.loc python locparse.py Mungbean_pacbio_scaffold_3_seg_dist.loc > Mungbean_pacbio_scaffold_3_seg_dist_format.loc (scaffold name is too long, eliminate '|')
2/23
Mungbean pacbio assembly
too many snp for joinmap, so filtering missing < 12
python ~/reseq/vcfparse.py variants_snp_compare_parents.vcf python ~/reseq/vcfparse_coseg.py variants_snp_compare_parents_filtered.vcf Mungbean_pacbio_scaffold_4.loc python ~/reseq/cal_seg_dist.py Mungbean_pacbio_scaffold_4.loc 9110 python locparse.py Mungbean_pacbio_scaffold_4_seg_dist.loc > Mungbean_pacbio_scaffold_4_seg_dist_format.loc
2/27
Mungbean pacbio assembly
Mungbean_pacbio_scaffold_7_seg_dist_format.loc : no hetero, missing < 18
ALLMAPS install (244:/kev8305/skyts0401/program)
easy_install biopython numpy deap networkx matplotlib jcvi wget https://dl.dropboxusercontent.com/u/15937715/Data/ALLMAPS/ALLMAPS-install.sh sh ALLMAPS-install.sh
Then add the directory containing the ALLMAPS binaries (concorde, faSize, liftOver) to $PATH in ~/.profile
ALLMAPS (244:/kev8305/SK3/anchoring)
python ~/reseq/allmaps_format.py Mungbean_pacbio_5_joinmap.result > Mungbean_pacbio_5_joinmap.for.allmaps python ~/reseq/allmaps_format.py Mungbean_pacbio_7_joinmap.result > Mungbean_pacbio_7_joinmap.for.allmaps python -m jcvi.assembly.allmaps merge Mungbean_pacbio_5_joinmap.for.allmaps Mungbean_pacbio_7_joinmap.for.allmaps -o JM-2.bed python -m jcvi.assembly.allmaps path JM-2.bed falcon_500_sspace.final.scaffolds.fasta.header.fasta
3/2
Mungbean pacbio assembly
MUMmer install, for dot plot between pacbio and previous ref
wget https://downloads.sourceforge.net/project/mummer/mummer/3.23/MUMmer3.23.tar.gz tar -xvf MUMmer3.23.tar.gz cd MUMmer3.23 make check make install MUMmer3.23/mummer -mum -b -c Vradi.ver6.cor.fa.chr.fa JM-2.chr.fasta > ref_qry.mums
3/3
Mungbean pacbio assembly
lastz install (244:/kev8305/skyts0401/program/)
download from http://www.bx.psu.edu/~rsharris/lastz/ tar -xvzf lastz-1.02.00.tar.gz cd lastz-distrib-1.02.00/src/ ---------------------------------- problem with Makefile, so delete -Werror in line 31 of Makefile, save. ---------------------------------- make make install
add path /home/skyts0401/lastz-distrib/bin in .profile
lastz (244:/kev8305/SK3/anchoring/)
lastz JM-2.chr.fasta[multiple] Vradi.ver6.cor.fa --notransition --step=20 --gfextend --chain --gapped --format=sam > old_new.sam
3/7
Mungbean pacbio assembly
MUMmer, having a problem with memory, was re-installed with a memory configuration
make clean make CPPFLAGS="-O3 -DSIXTYFOURBITS" make install
and use nucmer to align pacbio assembly and previous reference
MUMmer3.23/nucmer -maxmatch -c 100 -p ref_qry JM-2.chr.fasta Vradi.ver6.cor.fa MUMmer3.23/nucmer --noextend -c 100 -p ref_qry_noextend JM-2.chr.fasta Vradi.ver6.cor.fa
and draw a dot plot using mummerplot
mummerplot --fat -l -png ref_qry_noextend.delta
but it fails with an error like
set mouse clipboardformat "[%.0f, %.0f]" ^ "out.gp", line 2594: wrong option
It seems gnuplot was updated and no longer supports that option written by mummerplot. Just edit out.gp to delete that line.
/kev8305/skyts0401/program/last-842/scripts/last-dotplot -2 'Vr*' -2 'scaffold_?' -x 1920 -y 1920 ref_qry.maf plot.png
3/16 ~
Mungbean pacbio assembly
compare between pacbio assembly and previous reference
1. 50 reseq marker/LG on previous reference mapping on pacbio super scaffold for checking same marker is on same chromosome. (244:/kev8305/SK3/anchoring/check)
python SNP_marker_pos.py Vradi_ver6.fa Mungbean_chr_coseg_parse_seg_dist.loc > Vradi.ver6.reseq.marker.fasta makeblastdb -in JM-2.chr.fasta -dbtype 'nucl' -out Mungbean_pacbio blastn -db Mungbean_pacbio -query Vradi.ver6.reseq.marker.fasta -outfmt 6 -out reseq_marker.blast -num_threads 2 -evalue 1e-5 -word_size 100 python blastparse.py reseq_marker.blast > reseq_marker_for_svg.result python chr_compare_svg.py fasta.size reseq_marker_for_svg.result > chr_compare_3.svg (output can be changed based on option in python code)
/data2/skyts0401/program/circos-0.69-4/bin/circos -conf chr_compare.conf (193:/data2/skyts0401/check/circos)
2. contig compare. (63:/data/skyts0401/Mungbean/assembly/)
scp assembly@147.46.250.181:/home/assembly/data/Mungbean/mapping/p_ctg.longest.fa . scp skyts0401@147.46.250.244:/kev8305/SK3/anchoring/final.contigs.longest100.fa . gmap_build -d pacbio_contig_new p_ctg.longest.fa -D ./ gmap -d pacbio_contig_new -D pacbio_contig_new/ final.contigs.longest100.fa -t 12 -f 1 > pacbio_contig_compare.psl -------------------------------------------------------------------------------------- (NICEM:/home/assembly/check/) ../bwa-0.7.15/bwa mem -t 30 p_ctg.longest.longest3.fa SunhwaN_1.fastq.gz SunhwaN_2.fastq.gz > newcontig_illumina.sam (244:/kev8305/SK3/anchoring/check/) ln -s /NGS/NGS/VignaRadiata/DNA/Sunhwa_pacbio/filtered_subreads.fasta . bwa index p_ctg.longest.longest1.fa bwa mem -t 8 p_ctg.longest.longest1.fa filtered_subreads.fasta > newcontig_pacbio.sam samtools view -Sb newcontig_pacbio.sam > newcontig_pacbio.bam samtools sort newcontig_pacbio.bam -o newcontig_pacbio.sorted.bam samtools index newcontig_pacbio.sorted.bam ~ same samtools command with newcontig_illumina.sam ~
Found what looked like split mappings, so re-aligned with the end-to-end mode of bowtie2
(NICEM:~/check/) ~/bowtie2-2.2.9/bowtie2-build p_ctg.longest.longest1.fa p_ctg.longest.longest1.fa ~/bowtie2-2.2.9/bowtie2 -x p_ctg.longest.longest1.fa -1 SunhwaN_1.fastq.gz -2 SunhwaN_2.fastq.gz --end-to-end --very-fast -p 30 -S newcontig_illumina_endtoend.sam ~/bowtie2-2.2.9/bowtie2 -x p_ctg.longest.longest1.fa -f filtered_subreads.fasta --end-to-end --very-fast -p 30 -S newcontig_pacbio_endtoend.sam !!!bowtie2-2.3.0 version has a bug!!!
(244:/kev8305/SK3/anchoring/check/) scp assembly@147.46.250.181:/home/assembly/check/newcontig_pacbio_endtoend.sam . scp assembly@147.46.250.181:/home/assembly/check/newcontig_illumina_endtoend.sam . ~ same samtools command, view, sort, index ~ samtools depth -a -q 0 -Q 0 -r 000000F:2000000-4000000 newcontig_illumina_endtoend.sorted.bam > newcontig_illumina_endtoend.mapping.depth samtools depth -a -q 0 -Q 0 -r 000000F:2000000-4000000 newcontig_pacbio_endtoend.sorted.bam > newcontig_pacbio_endtoend.mapping.depth
blat for comparing contig (NICEM:/home/assembly/check/, 244:/kev8305/SK3/anchoring/check/)
------------------------------ (contig_compare.sh) #!/bin/bash for i in {0..19}; do ../blat p_ctg.longest.longest3.fa final.contigs_devide${i}.fa contig_compare_all_${i}.psl & done wait ------------------------------
(NICEM) python fasta_devide.py final.contigs.reformed.fasta chmod a+x contig_compare.sh ./contig_compare.sh ls contig_compare_all_*.psl > psl.list nano pslfilter.py python pslfilter.py psl.list > contig_compare_all.result python pslfilter2.py contig_compare_all.result > contig_compare_all_filtered.result
4/18
Jatropha assembly
make Jatropha figure(chr - lg) for new version(allmaps) (244:/kev8305/skyts0401/Jatropha)
scp skyts0401@147.46.250.63:/home/skyts0401/svg/make_chr_lg_svg.py make_chr_lg_svg_revised_for_allmaps.py python make_chr_lg_svg_revised_for_allmaps.py Jatropha_map1.result Jatropha.allmaps.agp > Jatropha_chr_lg.svg
4/26
Mungbean pacbio assembly
mungbean super scaffold (JM-2.fasta) was gap filled. Final assembly Fasta is in /kev8305/SK3/anchoring/gapfilled_assembly_final/
5/1
Mungbean pacbio assembly
Repeat masking progress is based on these sites: http://weatherby.genetics.utah.edu/MAKER/wiki/index.php/Repeat_Library_Construction-Advanced, http://www.repeatmasker.org/
Repeat masking program installation
Repbase - for RepeatMasker
(63:/data/skyts0401/program/) should register http://www.girinst.org/ download RepBaseRepeatMaskerEdition cp RepBaseRepeatMaskerEdition-20170127\ \(1\).tar.gz RepeatMasker/. cd RepeatMasker/ tar -xvzf RepBaseRepeatMaskerEdition-20170127\ \(1\).tar.gz (Libraries/ directory will be created and all files will be copied to RepeatMasker/Libraries/)
rmblast - for RepeatMasker (ver 2.6.0 has problem with install, so I installed v. 2.2.28)
(63:/data/skyts0401/program/) download from http://www.repeatmasker.org/RMBlast.html wget ftp://ftp.ncbi.nlm.nih.gov/blast/executables/rmblast/2.2.28/ncbi-rmblastn-2.2.28-x64-linux.tar.gz wget ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/2.2.28/ncbi-blast-2.2.28+-x64-linux.tar.gz tar zxvf ncbi-blast-2.2.28+-x64-linux.tar.gz tar zxvf ncbi-rmblastn-2.2.28-x64-linux.tar.gz cp -R ncbi-rmblastn-2.2.28/* ncbi-blast-2.2.28+/ rm -rf ncbi-rmblastn-2.2.28 mv ncbi-blast-2.2.28+ rmblast-2.2.28
trf - for RepeatMasker
(63:/data/skyts0401/program/) download from http://tandem.bu.edu/trf/trf.html chmod a+x trf409.linux64 ln -s trf409.linux64 RepeatMasker/trf
RepeatMasker
(63:/data/skyts0401/program/) wget http://www.repeatmasker.org/RepeatMasker-open-4-0-7.tar.gz tar -xzf RepeatMasker-open-4-0-7.tar.gz cd RepeatMasker/ (move the Repbase library to RepeatMasker/Libraries/) perl ./configure configure directory of trf, rmblast
muscle - for MITE-Hunter
(63:/data/skyts0401/program/) check version on http://www.drive5.com/muscle/ wget http://www.drive5.com/muscle/downloads3.8.31/muscle3.8.31_i86linux64.tar.gz tar -xvzf muscle3.8.31_i86linux64.tar.gz mkdir muscle mv muscle3.8.31_i86linux64 muscle/
mdust - for MITE-Hunter
(63:/data/skyts0401/program/) wget ftp://occams.dfci.harvard.edu/pub/bio/tgi/software//seqclean/mdust.tar.gz tar -xvzf mdust.tar.gz
MITE-Hunter
(63:/data/skyts0401/program/) check version on http://target.iplantcollaborative.org/mite_hunter.html wget http://target.iplantcollaborative.org/mite_hunter/MITE%20Hunter-11-2011.zip unzip MITE\ Hunter-11-2011.zip mv MITE\ Hunter/ MITE_Hunter cd MITE_Hunter/ perl MITE_Hunter_Installer.pl -d /data/skyts0401/program/MITE_Hunter -f formatdb -b blastall -m /data/skyts0401/program/mdust -M /data/skyts0401/program/muscle
GenomeTools
(63:/data/skyts0401/program/) check version on http://genometools.org/ wget http://genometools.org/pub/genometools-1.5.9.tar.gz tar -xvzf genometools-1.5.9.tar.gz cd genometools-1.5.9/ make sudo make install - if have a problem with dependency, please check this - sudo apt-get install libcairo2-dev sudo apt-get install libpango1.0-dev
Genome tRNA database
(63:/home/skyts0401/bin/) check version on http://gtrnadb.ucsc.edu wget http://gtrnadb2009.ucsc.edu/download/tRNAs/eukaryotic-tRNAs.fa.gz gunzip eukaryotic-tRNAs.fa.gz
CRL scripts
(63:/home/skyts0401/bin/) wget http://www.hrt.msu.edu/uploads/535/78637/CRL_Scripts1.0.tar.gz tar -xvzf CRL_Scripts1.0.tar.gz
transposons protein database
(63:/home/skyts0401/bin/) wget http://www.hrt.msu.edu/uploads/535/78637/Tpases020812DNA.gz gunzip Tpases020812DNA.gz wget http://www.hrt.msu.edu/uploads/535/78637/Tpases020812.gz gunzip Tpases020812.gz
plant protein database
(63:/home/skyts0401/bin/) wget http://www.hrt.msu.edu/uploads/535/78637/alluniRefprexp070416.gz gunzip alluniRefprexp070416.gz
RECON - for RepeatModeler
(63:/data/skyts0401/program/) wget http://www.repeatmasker.org/RepeatModeler/RECON-1.08.tar.gz tar -xvzf RECON-1.08.tar.gz cd RECON-1.08/src/ make make install cd ../scripts/ nano recon.pl (added /data/skyts0401/program/RECON-1.08/bin to PATH = "" (third line))
RepeatScout - for RepeatModeler
(63:/data/skyts0401/program/) wget http://www.repeatmasker.org/RepeatScout-1.0.5.tar.gz tar -xvzf RepeatScout-1.0.5.tar.gz cd RepeatScout-1/ make sudo make install
nseg - for RepeatModeler
(63:/data/skyts0401/program/) mkdir nseg cd nseg wget ftp://ftp.ncbi.nih.gov/pub/seg/nseg/* . make
RepeatModeler
(63:/data/skyts0401/program/) wget http://www.repeatmasker.org/RepeatModeler/RepeatModeler-open-1.0.9.tar.gz tar -xvzf RepeatModeler-open-1.0.9.tar.gz cd RepeatModeler-open-1.0.9/ perl ./configure configure directory of RECON, RepeatScout, nseg, trf, rmblast
hmmer - for ProtExcluder
(63:/data/skyts0401/program/) wget http://eddylab.org/software/hmmer3/3.1b2/hmmer-3.1b2-linux-intel-x86_64.tar.gz tar -xvzf hmmer-3.1b2-linux-intel-x86_64.tar.gz cd hmmer-3.1b2-linux-intel-x86_64/ ./configure make sudo make install
ProtExcluder
wget http://www.hrt.msu.edu/uploads/535/78637/ProtExcluder1.2.tar.gz tar -xvzf ProtExcluder1.2.tar.gz cd ProtExcluder1.2/ ./Installer.pl -m /data/skyts0401/program/hmmer-3.1b2-linux-intel-x86_64/binaries/ -p /data/skyts0401/program/ProtExcluder1.2/
Repeat masking progress
Basic command which I used is based on http://weatherby.genetics.utah.edu/MAKER/wiki/index.php/Repeat_Library_Construction-Advanced
Move Mungbean genome assembly final version
scp skyts0401@147.46.250.244:/kev8305/SK3/anchoring/gapfilled_assembly_final/standard_output.gapfilled.final.fa .
MITE library
(63:/data/skyts0401/program/MITE_Hunter/) perl MITE_Hunter_manager.pl -i /data/skyts0401/Mungbean/assembly/standard_output.gapfilled.final.fa -g Mungbean -c 10 -S 12345678 mv Mungbean* /data/skyts0401/Mungbean/repeatmask/MITE/. cd /data/skyts0401/Mungbean/repeatmask/MITE/ cat Mungbean_Step8_*.fa > ../MITE.lib
LTR library
(63:/data/skyts0401/Mungbean/repeatmask/LTR/99/) ln -s /data/skyts0401/Mungbean/assembly/standard_output.gapfilled.final.fa . gt suffixerator -db standard_output.gapfilled.final.fa -indexname Mungbean_LTR -tis -suf -lcp -des -ssp -dna gt ltrharvest -index Mungbean_LTR -out Mungbean.out99 -outinner Mungbean.outinner99 -gff3 Mungbean.gff99 -minlenltr 100 -maxlenltr 6000 -mindistltr 1500 -maxdistltr 25000 -mintsd 5 -maxtsd 5 -motif tgca -similar 99 -vic 10 > Mungbean.result99 gt gff3 -sort Mungbean.gff99 > Mungbean.gff99.sort gt ltrdigest -trnas ~/bin/eukaryotic-tRNAs.fa Mungbean.gff99.sort Mungbean_LTR > Mungbean.gff99.dgt perl ~/bin/CRL_Scripts1.0/CRL_Step1.pl --gff Mungbean.gff99.dgt perl ~/bin/CRL_Scripts1.0/CRL_Step2.pl --step1 CRL_Step1_Passed_Elements.txt --repeatfile Mungbean.out99 --resultfile Mungbean.result99 --sequencefile standard_output.gapfilled.final.fa --removed_repeats CRL_Step2_Passed_Elements.fasta mkdir fasta_files mv Repeat_*.fasta fasta_files/\ mv Repeat_*.fasta fasta_files/ mv CRL_Step2_Passed_Elements.fasta fasta_files/ cd fasta_files/ perl ~/bin/CRL_Scripts1.0/CRL_Step3.pl --directory . --step2 CRL_Step2_Passed_Elements.fasta --pidentity 60 --seq_c 25 mv CRL_Step3_Passed_Elements.fasta .. cd .. perl ~/bin/CRL_Scripts1.0/ltr_library.pl --resultfile Mungbean.result99 --step3 CRL_Step3_Passed_Elements.fasta --sequencefile standard_output.gapfilled.final.fa cp lLTR_Only.lib ../lLTR_Only_99.lib cat lLTR_Only.lib ../../MITE/MITE.lib > repeats_to_mask_LTR99.fasta /data/skyts0401/program/RepeatMasker/RepeatMasker -lib repeats_to_mask_LTR99.fasta -nolow -dir . 
Mungbean.outinner99 perl ~/bin/CRL_Scripts1.0/cleanRM.pl Mungbean.outinner99.out Mungbean.outinner99.masked > Mungbean.outinner99.unmasked perl ~/bin/CRL_Scripts1.0/rmshortinner.pl Mungbean.outinner99.unmasked 50 > Mungbean.outinner99.clean makeblastdb -in ~/bin/Tpases020812DNA -dbtype prot blastx -query Mungbean.outinner99.clean -db ~/bin/Tpases020812DNA -evalue 1e-10 -num_descriptions 10 -out Mungbean.outinner99.clean_blastx.out.txt perl ~/bin/CRL_Scripts1.0/outinner_blastx_parse.pl --blastx Mungbean.outinner99.clean_blastx.out.txt --outinner Mungbean.outinner99 perl ~/bin/CRL_Scripts1.0/CRL_Step4.pl --step3 CRL_Step3_Passed_Elements.fasta --resultfile Mungbean.result99 --innerfile passed_outinner_sequence.fasta --sequencefile standard_output.gapfilled.final.fa makeblastdb -in lLTRs_Seq_For_BLAST.fasta -dbtype nucl blastn -query lLTRs_Seq_For_BLAST.fasta -db lLTRs_Seq_For_BLAST.fasta -evalue 1e-10 -num_descriptions 1000 -out lLTRs_Seq_For_BLAST.fasta.out makeblastdb -in Inner_Seq_For_BLAST.fasta -dbtype nucl blastn -query Inner_Seq_For_BLAST.fasta -db Inner_Seq_For_BLAST.fasta -evalue 1e-10 -num_descriptions 1000 -out Inner_Seq_For_BLAST.fasta.out perl ~/bin/CRL_Scripts1.0/CRL_Step5.pl --LTR_blast lLTRs_Seq_For_BLAST.fasta.out --inner_blast Inner_Seq_For_BLAST.fasta.out --step3 CRL_Step3_Passed_Elements.fasta --final LTR99.lib --pcoverage 90 --pidentity 80
relatively old LTR (Same command with above one, but for relatively old LTR)
(63:/data/skyts0401/Mungbean/repeatmask/LTR/85) (to avoid confuse LTR_99 with this results, make directory 99 and 85 in LTR directory) ln -s /data/skyts0401/Mungbean/assembly/standard_output.gapfilled.final.fa . gt suffixerator -db standard_output.gapfilled.final.fa -indexname Mungbean_LTR -tis -suf -lcp -des -ssp -dna gt ltrharvest -index Mungbean_LTR -out Mungbean.out85 -outinner Mungbean.outinner85 -gff3 Mungbean.gff85 -minlenltr 100 -maxlenltr 6000 -mindistltr 1500 -maxdistltr 25000 -mintsd 5 -maxtsd 5 -vic 10 > Mungbean.result85 cp ../99/CRL_Step1_Passed_Elements.txt . perl ~/bin/CRL_Scripts1.0/CRL_Step2.pl --step1 CRL_Step1_Passed_Elements.txt --repeatfile Mungbean.out85 --resultfile Mungbean.result85 --sequencefile standard_output.gapfilled.final.fa --removed_repeats CRL_Step2_Passed_Elements.fasta mkdir fasta_files mv Repeat_*.fasta fasta_files/ mv CRL_Step2_Passed_Elements.fasta fasta_files/ cd fasta_files/ perl ~/bin/CRL_Scripts1.0/CRL_Step3.pl --directory . --step2 CRL_Step2_Passed_Elements.fasta --pidentity 60 --seq_c 25 mv CRL_Step3_Passed_Elements.fasta .. cd .. perl ~/bin/CRL_Scripts1.0/ltr_library.pl --resultfile Mungbean.result85 --step3 CRL_Step3_Passed_Elements.fasta --sequencefile standard_output.gapfilled.final.fa cp lLTR_Only.lib ../lLTR_Only_85.lib cat lLTR_Only.lib ../../MITE/MITE.lib > repeats_to_mask_LTR85.fasta /data/skyts0401/program/RepeatMasker/RepeatMasker -lib repeats_to_mask_LTR85.fasta -nolow -dir . 
Mungbean.outinner85 perl ~/bin/CRL_Scripts1.0/cleanRM.pl Mungbean.outinner85.out Mungbean.outinner85.masked > Mungbean.outinner85.unmasked perl ~/bin/CRL_Scripts1.0/rmshortinner.pl Mungbean.outinner85.unmasked 50 > Mungbean.outinner85.clean makeblastdb -in ~/bin/Tpases020812DNA -dbtype prot blastx -query Mungbean.outinner85.clean -db ~/bin/Tpases020812DNA -evalue 1e-10 -num_descriptions 10 -out Mungbean.outinner85.clean_blastx.out.txt perl ~/bin/CRL_Scripts1.0/outinner_blastx_parse.pl --blastx Mungbean.outinner85.clean_blastx.out.txt --outinner Mungbean.outinner85 perl ~/bin/CRL_Scripts1.0/CRL_Step4.pl --step3 CRL_Step3_Passed_Elements.fasta --resultfile Mungbean.result85 --innerfile passed_outinner_sequence.fasta --sequencefile standard_output.gapfilled.final.fa makeblastdb -in lLTRs_Seq_For_BLAST.fasta -dbtype nucl blastn -query lLTRs_Seq_For_BLAST.fasta -db lLTRs_Seq_For_BLAST.fasta -evalue 1e-10 -num_descriptions 1000 -out lLTRs_Seq_For_BLAST.fasta.out makeblastdb -in Inner_Seq_For_BLAST.fasta -dbtype nucl blastn -query Inner_Seq_For_BLAST.fasta -db Inner_Seq_For_BLAST.fasta -evalue 1e-10 -num_descriptions 1000 -out Inner_Seq_For_BLAST.fasta.out perl ~/bin/CRL_Scripts1.0/CRL_Step5.pl --LTR_blast lLTRs_Seq_For_BLAST.fasta.out --inner_blast Inner_Seq_For_BLAST.fasta.out --step3 CRL_Step3_Passed_Elements.fasta --final LTR85.lib --pcoverage 90 --pidentity 80 /data/skyts0401/program/RepeatMasker/RepeatMasker -lib ../99/LTR99.lib -dir . LTR85.lib perl ~/bin/CRL_Scripts1.0/remove_masked_sequence.pl --masked_elements LTR85.lib.masked --outfile FinalLTR85.lib cd .. cat 99/LTR99.lib 85/FinalLTR85.lib > allLTR.lib
Collecting repetitive sequences
ln -s /data/skyts0401/Mungbean/assembly/standard_output.gapfilled.final.fa . nano fasta_devide.py python fasta_devide.py standard_output.gapfilled.final.fa nano repeatmask_combine.sh chmod a+x repeatmask_combine.sh ./repeatmask_combine.sh cat standard_output.gapfilled.final_devide*.fa.masked > standard_output.gapfilled.final.fa.masked perl ~/bin/CRL_Scripts1.0/rmaskedpart.pl standard_output.gapfilled.final.fa.masked 50 > umseqfile /data/skyts0401/program/RepeatModeler-open-1.0.9/BuildDatabase -name umseqfiledb -engine ncbi umseqfile nohup /data/skyts0401/program/RepeatModeler-open-1.0.9/RepeatModeler -database umseqfiledb >& umseqfile.out perl ~/bin/CRL_Scripts1.0/repeatmodeler_parse.pl --fastafile consensi.fa.classified --unknowns repeatmodeler_unknowns.fasta --identities repeatmodeler_identities.fasta makeblastdb -in ~/bin/Tpases020812 -dbtype prot blastx -query repeatmodeler_unknowns.fasta -db ~/bin/Tpases020812 -evalue 1e-10 -num_descriptions 10 -out modelerunknown_blast_result.txt ~/bin/CRL_Scripts1.0/transposon_blast_parse.pl --blastx modelerunknown_blast_result.txt --modelerunknown repeatmodeler_unknowns.fasta mv unknown_elements.txt ModelerUnknown.lib cat identified_elements.txt repeatmodeler_identities.fasta > ModelerID.lib
Exclusion of gene fragments
makeblastdb -in ~/bin/alluniRefprexp070416 -dbtype prot blastx -query ModelerUnknown.lib -db ~/bin/alluniRefprexp070416 -evalue 1e-10 -num_descriptions 10 -out ModelerUnknown.lib_blast_result.txt cd LTR/ python headerforamt.py allLTR.lib > allLTR.lib.reformed (LTR library has '(' symbol, resulting in ProtExcluder error, so change the format) cd .. mkdir ProtExclude cd ProtExclude/ cp ../MITE/MITE.lib . cp ../LTR/allLTR.lib.reformed . cp ../ModelerID.lib . cp ../ModelerUnknown.lib . cat allLTR.lib.reformed MITE.lib ModelerID.lib > KnownRepeats.lib cat KnownRepeats.lib ModelerUnknown.lib > allRepeats.lib blastx -query allRepeats.lib -db ~/bin/alluniRefprexp070416 -evalue 1e-10 -num_descriptions 10 -out allRepeats.lib_blast_results.txt /data/skyts0401/program/ProtExcluder1.2/ProtExcluder.pl allRepeats.lib_blast_results.txt allRepeats.lib
5/26
Mungbean pacbio assembly
For assessment of assembly, run CEGMA and BUSCO
Install
CEGMA
(63:/data/skyts0401/program/) sudo apt-get install wise (dependency) wget ftp://genome.crg.es/pub/software/geneid/geneid_v1.4.4.Jan_13_2011.tar.gz (dependency) tar -xvzf geneid_v1.4.4.Jan_13_2011.tar.gz cd geneid make make install nano ~/.profile (add $PATH:/data/skyts0401/program/geneid/bin) cd .. git clone https://github.com/KorfLab/CEGMA_v2.git cd CEGMA_v2/ make
BUSCO
(63:/data/skyts0401/program/) wget http://bioinf.uni-greifswald.de/augustus/binaries/augustus-3.2.3.tar.gz (dependency) tar -xvzf augustus-3.2.3.tar.gz cd augustus-3.2.3/ make (dependency error) sudo apt-get install bamtools libbamtools-dev make sudo make install cd .. git clone https://gitlab.com/ezlab/busco.git cd busco sudo python setup.py install cp config/config.ini.default config/config.ini nano config.ini (change the augustus path (path = /data/skyts0401/program/augustus-3.2.3/scripts/, bin/))
Running
CEGMA
(63:/data/skyts0401/Mungbean/cegma/) export CEGMA="/data/skyts0401/program/CEGMA_v2" export PERL5LIB="$PERL5LIB:$CEGMA/lib" export PERL5LIB=$CEGMA/lib:$PERL5LIB source ~/.profile /data/skyts0401/program/CEGMA_v2/bin/cegma --genome standard_output.gapfilled.final.fa -threads 5
BUSCO
(63:/data/skyts0401/Mungbean/busco/) wget http://busco.ezlab.org/datasets/eukaryota_odb9.tar.gz (dataset) wget http://busco.ezlab.org/datasets/embryophyta_odb9.tar.gz (dataset) ln -s ../assembly/standard_output.gapfilled.final.fa . export AUGUSTUS_CONFIG_PATH="/data/skyts0401/program/augustus-3.2.3/config/" python /data/skyts0401/program/busco/scripts/run_BUSCO.py -i standard_output.gapfilled.final.fa -o Mungbean_busco -c 20 -l eukaryota_odb9/ -m geno python /data/skyts0401/program/busco/scripts/run_BUSCO.py -i standard_output.gapfilled.final.fa -o Mungbean_plant_busco -c 20 -l embryophyta_odb9/ -m geno
5/29
Mungbean pacbio assembly
Maker
Install
ncbi-blast+
(63:/data/skyts0401/program/) wget ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/2.6.0/ncbi-blast-2.6.0+-x64-linux.tar.gz tar -xvzf ncbi-blast-2.6.0+-x64-linux.tar.gz
exonerate
(63:/data/skyts0401/program/) git clone https://github.com/nathanweeks/exonerate.git cd exonerate/ git checkout v2.4.0 autoreconf -i ./configure make sudo make install
Maker
(63:/data/skyts0401/program/) download from http://www.yandell-lab.org/software/maker.html cd maker/ cd src/ nano ~/.profile (add $PATH=RepeatMasker) source ~/.profile perl Build.PL ./Build install
Running
Preparation
(63:/data/skyts0401/Mungbean/maker/) (Add PATH(/data/skyts0401/program/maker/bin) to ~/.profile) ln -s ../assembly/Vradi.pacbio.gapfilled.final.fa . mkdir ../transcriptome cd ../transcriptome/ scp skyts0401@147.46.250.244:/data/KangYJ/Mungbean/Transcriptome/merge/mungbean_merge.fa.cdhit.fa . cd ../maker/ ln -s ../transcriptome/mungbean_merge.fa.cdhit.fa . mkdir ref cd ref/ (download Fvesca annotation file from phytozome) unzip Fvesca_download.zip cd Fvesca/v1.1/annotation/ gunzip Fvesca_226_v1.1.protein.fa.gz gunzip Fvesca_226_v1.1.transcript.fa.gz cd ../../.. cp Fvesca/v1.1/annotation/Fvesca_226_v1.1.protein.fa . cp Fvesca/v1.1/annotation/Fvesca_226_v1.1.transcript.fa . scp skyts0401@147.46.250.244:/alima9002/ref/forJat/Athaliana_167_TAIR10*.fa scp skyts0401@147.46.250.244:/alima9002/ref/forJat/Athaliana_167_TAIR10*.fa . scp skyts0401@147.46.250.244:/alima9002/ref/forJat/Gmax*.fa . scp skyts0401@147.46.250.244:/alima9002/ref/forJat/Ptrichocarpa*.fa . scp skyts0401@147.46.250.244:/alima9002/ref/forJat/Vvinifera*.fa . scp skyts0401@147.46.250.244:/alima9002/ref/forJat/Osativa*.fa . cd .. ln -s ../repeatmask/ProtExclude/allRepeats.libnoProtFinal mkdir tmp
Running
(63:/data/skyts0401/Mungbean/maker/) maker -CTL nano maker_bopts.ctl (default, check blast_type=ncbi+) nano maker_exe.ctl (change the path ncbi-blast+, RepeatMasker, exonerate, augustus) nano maker_opts.ctl (change the path genome, evidence(transcriptome, protein), repeat library, temporary directory) mpiexec -n 30 maker -fix_nucleotides maker_opts.ctl maker_bopts.ctl maker_exe.ctl >& maker_opts.ctl.log
6/26
Mungbean pacbio assembly
Checking synteny blocks for splitting and combining chromosomes
blast
(NICEM:~/data/Mungbean/blast) makeblastdb -in Vradi.ver6.cor.pep.fa -dbtype 'prot' blastall -i adzuki.ver3.pep.fa.tr.cor.fa -d Vradi.ver6.cor.pep.fa -p blastp -e 1e-10 -b 5 -v 5 -m 8 -o mcscanx/old_Va.blast # same procedure for other organism protein
MCScanX
(NICEM:~/data/Mungbean/blast/mcscanx) python gffcombine.py Vradi_ver6.gff.sorted.by.TY.gff adzuki.ver3.gene.gff.cor.gff > old_Va.gff ~/data/program/MCScanX/MCScanX old_Va # same procedure for other organism protein, just change the species name in gffcombine.py and command
Circos
(193:/data2/skyts0401/Mungbean/synteny/circos/) /data2/skyts0401/program/circos-0.69-4/bin/circos -conf synteny_Va_gene.conf -outputfile synteny_Va_gene.png # same procedure for other organism, change configuration file