Difference between revisions of "2017 Haneul Lab note"

From Crop Genomics Lab.
Jump to: navigation, search
(Mungbean pacbio assembly)
(Mungbean pacbio assembly)
 
(56 intermediate revisions by one user not shown)
Line 284: Line 284:
 
  "out.gp", line 2594: wrong option
 
  "out.gp", line 2594: wrong option
 
It seems gnuplot was updated and no longer supports that option generated by mummerplot. Just edit out.gp to delete that line.
 
It seems gnuplot was updated and no longer supports that option generated by mummerplot. Just edit out.gp to delete that line.
 +
 +
/kev8305/skyts0401/program/last-842/scripts/last-dotplot -2 'Vr*' -2 'scaffold_?' -x 1920 -y 1920 ref_qry.maf plot.png
 +
 +
== 3/16 ~ ==
 +
=== Mungbean pacbio assembly ===
 +
compare between pacbio assembly and previous reference
 +
 +
1. Map 50 reseq markers per LG from the previous reference onto the pacbio super scaffolds to check that the same markers are on the same chromosome. (244:/kev8305/SK3/anchoring/check)
 +
python SNP_marker_pos.py Vradi_ver6.fa Mungbean_chr_coseg_parse_seg_dist.loc > Vradi.ver6.reseq.marker.fasta
 +
makeblastdb -in JM-2.chr.fasta -dbtype 'nucl' -out Mungbean_pacbio
 +
blastn -db Mungbean_pacbio -query Vradi.ver6.reseq.marker.fasta -outfmt 6 -out reseq_marker.blast -num_threads 2 -evalue 1e-5 -word_size 100
 +
python blastparse.py reseq_marker.blast > reseq_marker_for_svg.result
 +
python chr_compare_svg.py fasta.size reseq_marker_for_svg.result > chr_compare_3.svg (output can be changed based on option in python code)
 +
 +
/data2/skyts0401/program/circos-0.69-4/bin/circos -conf chr_compare.conf (193:/data2/skyts0401/check/circos)
 +
 +
2. contig compare. (63:/data/skyts0401/Mungbean/assembly/)
 +
scp assembly@147.46.250.181:/home/assembly/data/Mungbean/mapping/p_ctg.longest.fa .
 +
scp skyts0401@147.46.250.244:/kev8305/SK3/anchoring/final.contigs.longest100.fa .
 +
gmap_build -d pacbio_contig_new p_ctg.longest.fa -D ./
 +
gmap -d pacbio_contig_new -D pacbio_contig_new/ final.contigs.longest100.fa -t 12 -f 1 > pacbio_contig_compare.psl
 +
--------------------------------------------------------------------------------------
 +
(NICEM:/home/assembly/check/)
 +
../bwa-0.7.15/bwa mem -t 30 p_ctg.longest.longest3.fa SunhwaN_1.fastq.gz SunhwaN_2.fastq.gz > newcontig_illumina.sam
 +
 
 +
(244:/kev8305/SK3/anchoring/check/)
 +
ln -s /NGS/NGS/VignaRadiata/DNA/Sunhwa_pacbio/filtered_subreads.fasta .
 +
bwa index p_ctg.longest.longest1.fa
 +
bwa mem -t 8 p_ctg.longest.longest1.fa filtered_subreads.fasta > newcontig_pacbio.sam
 +
 
 +
samtools view -Sb newcontig_pacbio.sam > newcontig_pacbio.bam
 +
samtools sort newcontig_pacbio.bam -o newcontig_pacbio.sorted.bam
 +
samtools index newcontig_pacbio.sorted.bam
 +
  ~ same samtools command with newcontig_illumina.sam ~
 +
 +
Found that some reads appeared to be split-mapped, so re-aligned with the end-to-end mode of bowtie2
 +
(NICEM:~/check/)
 +
~/bowtie2-2.2.9/bowtie2-build p_ctg.longest.longest1.fa p_ctg.longest.longest1.fa
 +
~/bowtie2-2.2.9/bowtie2 -x p_ctg.longest.longest1.fa -1 SunhwaN_1.fastq.gz -2 SunhwaN_2.fastq.gz --end-to-end --very-fast -p 30 -S newcontig_illumina_endtoend.sam
 +
~/bowtie2-2.2.9/bowtie2 -x p_ctg.longest.longest1.fa -f filtered_subreads.fasta --end-to-end --very-fast -p 30 -S newcontig_pacbio_endtoend.sam
 +
 +
!!!bowtie2-2.3.0 version has a bug!!!
 +
 +
(244:/kev8305/SK3/anchoring/check/)
 +
scp assembly@147.46.250.181:/home/assembly/check/newcontig_pacbio_endtoend.sam .
 +
scp assembly@147.46.250.181:/home/assembly/check/newcontig_illumina_endtoend.sam .
 +
~ same samtools command, view, sort, index ~
 +
samtools depth -a -q 0 -Q 0 -r 000000F:2000000-4000000 newcontig_illumina_endtoend.sorted.bam > newcontig_illumina_endtoend.mapping.depth
 +
samtools depth -a -q 0 -Q 0 -r 000000F:2000000-4000000 newcontig_pacbio_endtoend.sorted.bam > newcontig_pacbio_endtoend.mapping.depth
 +
 +
blat for comparing contig (NICEM:/home/assembly/check/, 244:/kev8305/SK3/anchoring/check/)
 +
------------------------------
 +
(contig_compare.sh)
 +
#!/bin/bash
 +
 +
for i in {0..19}; do
 +
        ../blat p_ctg.longest.longest3.fa final.contigs_devide${i}.fa contig_compare_all_${i}.psl &
 +
done
 +
 +
wait
 +
------------------------------
 +
 +
(NICEM)
 +
python fasta_devide.py final.contigs.reformed.fasta
 +
chmod a+x contig_compare.sh
 +
./contig_compare.sh
 +
ls contig_compare_all_*.psl > psl.list
 +
nano pslfilter.py
 +
python pslfilter.py psl.list > contig_compare_all.result
 +
python pslfilter2.py contig_compare_all.result > contig_compare_all_filtered.result
 +
 +
== 4/18 ==
 +
=== Jatropha assembly ===
 +
make Jatropha figure(chr - lg) for new version(allmaps) (244:/kev8305/skyts0401/Jatropha)
 +
scp skyts0401@147.46.250.63:/home/skyts0401/svg/make_chr_lg_svg.py make_chr_lg_svg_revised_for_allmaps.py
 +
python make_chr_lg_svg_revised_for_allmaps.py Jatropha_map1.result Jatropha.allmaps.agp > Jatropha_chr_lg.svg
 +
 +
== 4/26 ==
 +
=== Mungbean pacbio assembly ===
 +
mungbean super scaffold (JM-2.fasta) was gap filled. Final assembly Fasta is in /kev8305/SK3/anchoring/gapfilled_assembly_final/
 +
 +
== 5/1 ==
 +
=== Mungbean pacbio assembly ===
 +
Repeat masking progress is based on these sites: http://weatherby.genetics.utah.edu/MAKER/wiki/index.php/Repeat_Library_Construction-Advanced, http://www.repeatmasker.org/
 +
 +
 +
<big>'''Repeat masking program installation'''</big>
 +
 +
 +
Repbase - for RepeatMasker
 +
(63:/data/skyts0401/program/)
 +
should register http://www.girinst.org/
 +
download RepBaseRepeatMaskerEdition
 +
cp RepBaseRepeatMaskerEdition-20170127\ \(1\).tar.gz RepeatMasker/.
 +
cd RepeatMasker/
 +
tar -xvzf RepBaseRepeatMaskerEdition-20170127\ \(1\).tar.gz
 +
(Libraries/ directory will be created and all files will be copied to RepeatMasker/Libraries/)
 +
 +
rmblast - for RepeatMasker (ver 2.6.0 has problem with install, so I installed v. 2.2.28)
 +
(63:/data/skyts0401/program/)
 +
download from http://www.repeatmasker.org/RMBlast.html
 +
wget ftp://ftp.ncbi.nlm.nih.gov/blast/executables/rmblast/2.2.28/ncbi-rmblastn-2.2.28-x64-linux.tar.gz
 +
wget ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/2.2.28/ncbi-blast-2.2.28+-x64-linux.tar.gz
 +
tar zxvf ncbi-blast-2.2.28+-x64-linux.tar.gz
 +
tar zxvf ncbi-rmblastn-2.2.28-x64-linux.tar.gz
 +
cp -R ncbi-rmblastn-2.2.28/* ncbi-blast-2.2.28+/
 +
rm -rf ncbi-rmblastn-2.2.28
 +
mv ncbi-blast-2.2.28+ rmblast-2.2.28
 +
 +
trf - for RepeatMasker
 +
(63:/data/skyts0401/program/)
 +
download from http://tandem.bu.edu/trf/trf.html
 +
chmod a+x trf409.linux64
 +
ln -s trf409.linux64 RepeatMasker/trf
 +
 +
RepeatMasker
 +
(63:/data/skyts0401/program/)
 +
wget http://www.repeatmasker.org/RepeatMasker-open-4-0-7.tar.gz
 +
tar -xzf RepeatMasker-open-4-0-7.tar.gz
 +
cd RepeatMasker/
 +
(move the Repbase library to RepeatMasker/Libraries/)
 +
perl ./configure
 +
configure directory of trf, rmblast
 +
 +
muscle - for MITE-Hunter
 +
(63:/data/skyts0401/program/)
 +
check version on http://www.drive5.com/muscle/
 +
wget http://www.drive5.com/muscle/downloads3.8.31/muscle3.8.31_i86linux64.tar.gz
 +
tar -xvzf muscle3.8.31_i86linux64.tar.gz
 +
mkdir muscle
 +
mv muscle3.8.31_i86linux64 muscle/
 +
 +
mdust - for MITE-Hunter
 +
(63:/data/skyts0401/program/)
 +
wget ftp://occams.dfci.harvard.edu/pub/bio/tgi/software//seqclean/mdust.tar.gz
 +
tar -xvzf mdust.tar.gz
 +
 +
MITE-Hunter
 +
(63:/data/skyts0401/program/)
 +
check version on http://target.iplantcollaborative.org/mite_hunter.html
 +
wget http://target.iplantcollaborative.org/mite_hunter/MITE%20Hunter-11-2011.zip
 +
unzip MITE\ Hunter-11-2011.zip
 +
mv MITE\ Hunter/ MITE_Hunter
 +
cd MITE_Hunter/
 +
perl MITE_Hunter_Installer.pl -d /data/skyts0401/program/MITE_Hunter -f formatdb -b blastall -m /data/skyts0401/program/mdust -M /data/skyts0401/program/muscle
 +
 +
GenomeTools
 +
(63:/data/skyts0401/program/)
 +
check version on http://genometools.org/
 +
wget http://genometools.org/pub/genometools-1.5.9.tar.gz
 +
tar -xvzf genometools-1.5.9.tar.gz
 +
cd genometools-1.5.9/
 +
make
 +
sudo make install
 +
- if have a problem with dependency, please check this -
 +
sudo apt-get install libcairo2-dev
 +
sudo apt-get install libpango1.0-dev
 +
 +
Genome tRNA database
 +
(63:/home/skyts0401/bin/)
 +
check version on http://gtrnadb.ucsc.edu
 +
wget http://gtrnadb2009.ucsc.edu/download/tRNAs/eukaryotic-tRNAs.fa.gz
 +
gunzip eukaryotic-tRNAs.fa.gz
 +
 +
CRL scripts
 +
(63:/home/skyts0401/bin/)
 +
wget http://www.hrt.msu.edu/uploads/535/78637/CRL_Scripts1.0.tar.gz
 +
tar -xvzf CRL_Scripts1.0.tar.gz
 +
 +
transposons protein database
 +
(63:/home/skyts0401/bin/)
 +
wget http://www.hrt.msu.edu/uploads/535/78637/Tpases020812DNA.gz
 +
gunzip Tpases020812DNA.gz
 +
wget http://www.hrt.msu.edu/uploads/535/78637/Tpases020812.gz
 +
gunzip Tpases020812.gz
 +
 +
plant protein database
 +
(63:/home/skyts0401/bin/)
 +
wget http://www.hrt.msu.edu/uploads/535/78637/alluniRefprexp070416.gz
 +
gunzip alluniRefprexp070416.gz
 +
 +
RECON - for RepeatModeler
 +
(63:/data/skyts0401/program/)
 +
wget http://www.repeatmasker.org/RepeatModeler/RECON-1.08.tar.gz
 +
tar -xvzf RECON-1.08.tar.gz
 +
cd RECON-1.08/src/
 +
make
 +
make install
 +
cd ../scripts/
 +
nano recon.pl (added /data/skyts0401/program/RECON-1.08/bin to PATH = "" (third line))
 +
 +
RepeatScout - for RepeatModeler
 +
(63:/data/skyts0401/program/)
 +
wget http://www.repeatmasker.org/RepeatScout-1.0.5.tar.gz
 +
tar -xvzf RepeatScout-1.0.5.tar.gz
 +
cd RepeatScout-1/
 +
make
 +
sudo make install
 +
 +
nseg - for RepeatModeler
 +
(63:/data/skyts0401/program/)
 +
mkdir nseg
 +
cd nseg
 +
wget ftp://ftp.ncbi.nih.gov/pub/seg/nseg/* .
 +
make
 +
 +
RepeatModeler
 +
(63:/data/skyts0401/program/)
 +
wget http://www.repeatmasker.org/RepeatModeler/RepeatModeler-open-1.0.9.tar.gz
 +
tar -xvzf RepeatModeler-open-1.0.9.tar.gz
 +
cd RepeatModeler-open-1.0.9/
 +
perl ./configure
 +
configure directory of RECON, RepeatScout, nseg, trf, rmblast
 +
 +
hmmer - for ProtExcluder
 +
(63:/data/skyts0401/program/)
 +
wget http://eddylab.org/software/hmmer3/3.1b2/hmmer-3.1b2-linux-intel-x86_64.tar.gz
 +
tar -xvzf hmmer-3.1b2-linux-intel-x86_64.tar.gz
 +
cd hmmer-3.1b2-linux-intel-x86_64/
 +
./configure
 +
make
 +
sudo make install
 +
 +
ProtExcluder
 +
wget http://www.hrt.msu.edu/uploads/535/78637/ProtExcluder1.2.tar.gz
 +
tar -xvzf ProtExcluder1.2.tar.gz
 +
cd ProtExcluder1.2/
 +
./Installer.pl -m /data/skyts0401/program/hmmer-3.1b2-linux-intel-x86_64/binaries/ -p /data/skyts0401/program/ProtExcluder1.2/
 +
 +
 +
<big>'''Repeat masking progress'''</big>
 +
 +
Basic command which I used is based on http://weatherby.genetics.utah.edu/MAKER/wiki/index.php/Repeat_Library_Construction-Advanced
 +
 +
 +
Move Mungbean genome assembly final version
 +
scp skyts0401@147.46.250.244:/kev8305/SK3/anchoring/gapfilled_assembly_final/standard_output.gapfilled.final.fa .
 +
 +
MITE library
 +
(63:/data/skyts0401/program/MITE_Hunter/)
 +
perl MITE_Hunter_manager.pl -i /data/skyts0401/Mungbean/assembly/standard_output.gapfilled.final.fa -g Mungbean -c 10 -S 12345678
 +
mv Mungbean* /data/skyts0401/Mungbean/repeatmask/MITE/.
 +
cd /data/skyts0401/Mungbean/repeatmask/MITE/
 +
cat Mungbean_Step8_*.fa > ../MITE.lib
 +
 +
LTR library
 +
(63:/data/skyts0401/Mungbean/repeatmask/LTR/99/)
 +
ln -s /data/skyts0401/Mungbean/assembly/standard_output.gapfilled.final.fa .
 +
gt suffixerator -db standard_output.gapfilled.final.fa -indexname Mungbean_LTR -tis -suf -lcp -des -ssp -dna
 +
gt ltrharvest -index Mungbean_LTR -out Mungbean.out99 -outinner Mungbean.outinner99 -gff3 Mungbean.gff99 -minlenltr 100 -maxlenltr 6000 -mindistltr 1500 -maxdistltr 25000 -mintsd 5 -maxtsd 5 -motif tgca -similar 99 -vic 10 > Mungbean.result99
 +
gt gff3 -sort Mungbean.gff99 > Mungbean.gff99.sort
 +
gt ltrdigest -trnas ~/bin/eukaryotic-tRNAs.fa Mungbean.gff99.sort Mungbean_LTR > Mungbean.gff99.dgt
 +
perl ~/bin/CRL_Scripts1.0/CRL_Step1.pl --gff Mungbean.gff99.dgt
 +
perl ~/bin/CRL_Scripts1.0/CRL_Step2.pl --step1 CRL_Step1_Passed_Elements.txt --repeatfile Mungbean.out99 --resultfile Mungbean.result99 --sequencefile standard_output.gapfilled.final.fa --removed_repeats CRL_Step2_Passed_Elements.fasta
 +
mkdir fasta_files
 +
mv Repeat_*.fasta fasta_files/\
 +
mv Repeat_*.fasta fasta_files/
 +
mv CRL_Step2_Passed_Elements.fasta fasta_files/
 +
cd fasta_files/
 +
perl ~/bin/CRL_Scripts1.0/CRL_Step3.pl --directory . --step2 CRL_Step2_Passed_Elements.fasta --pidentity 60 --seq_c 25
 +
mv CRL_Step3_Passed_Elements.fasta ..
 +
cd ..
 +
perl ~/bin/CRL_Scripts1.0/ltr_library.pl --resultfile Mungbean.result99 --step3 CRL_Step3_Passed_Elements.fasta --sequencefile standard_output.gapfilled.final.fa
 +
cp lLTR_Only.lib ../lLTR_Only_99.lib
 +
cat lLTR_Only.lib ../../MITE/MITE.lib > repeats_to_mask_LTR99.fasta
 +
/data/skyts0401/program/RepeatMasker/RepeatMasker -lib repeats_to_mask_LTR99.fasta -nolow -dir . Mungbean.outinner99
 +
perl ~/bin/CRL_Scripts1.0/cleanRM.pl Mungbean.outinner99.out Mungbean.outinner99.masked > Mungbean.outinner99.unmasked
 +
perl ~/bin/CRL_Scripts1.0/rmshortinner.pl Mungbean.outinner99.unmasked 50 > Mungbean.outinner99.clean
 +
makeblastdb -in ~/bin/Tpases020812DNA -dbtype prot
 +
blastx -query Mungbean.outinner99.clean -db ~/bin/Tpases020812DNA -evalue 1e-10 -num_descriptions 10 -out Mungbean.outinner99.clean_blastx.out.txt
 +
perl ~/bin/CRL_Scripts1.0/outinner_blastx_parse.pl --blastx Mungbean.outinner99.clean_blastx.out.txt --outinner Mungbean.outinner99
 +
perl ~/bin/CRL_Scripts1.0/CRL_Step4.pl --step3 CRL_Step3_Passed_Elements.fasta --resultfile Mungbean.result99 --innerfile passed_outinner_sequence.fasta --sequencefile standard_output.gapfilled.final.fa
 +
makeblastdb -in lLTRs_Seq_For_BLAST.fasta -dbtype nucl
 +
blastn -query lLTRs_Seq_For_BLAST.fasta -db lLTRs_Seq_For_BLAST.fasta -evalue 1e-10 -num_descriptions 1000 -out lLTRs_Seq_For_BLAST.fasta.out
 +
makeblastdb -in Inner_Seq_For_BLAST.fasta -dbtype nucl
 +
blastn -query Inner_Seq_For_BLAST.fasta -db Inner_Seq_For_BLAST.fasta -evalue 1e-10 -num_descriptions 1000 -out Inner_Seq_For_BLAST.fasta.out
 +
perl ~/bin/CRL_Scripts1.0/CRL_Step5.pl --LTR_blast lLTRs_Seq_For_BLAST.fasta.out --inner_blast Inner_Seq_For_BLAST.fasta.out --step3 CRL_Step3_Passed_Elements.fasta --final LTR99.lib --pcoverage 90 --pidentity 80
 +
 +
relatively old LTR (Same command with above one, but for relatively old LTR)
 +
(63:/data/skyts0401/Mungbean/repeatmask/LTR/85)
 +
(to avoid confusing the LTR_99 results with these results, make directories 99 and 85 in the LTR directory)
 +
ln -s /data/skyts0401/Mungbean/assembly/standard_output.gapfilled.final.fa .
 +
gt suffixerator -db standard_output.gapfilled.final.fa -indexname Mungbean_LTR -tis -suf -lcp -des -ssp -dna
 +
gt ltrharvest -index Mungbean_LTR -out Mungbean.out85 -outinner Mungbean.outinner85 -gff3 Mungbean.gff85 -minlenltr 100 -maxlenltr 6000 -mindistltr 1500 -maxdistltr 25000 -mintsd 5 -maxtsd 5 -vic 10  > Mungbean.result85
 +
cp ../99/CRL_Step1_Passed_Elements.txt .
 +
perl ~/bin/CRL_Scripts1.0/CRL_Step2.pl --step1 CRL_Step1_Passed_Elements.txt --repeatfile Mungbean.out85 --resultfile Mungbean.result85 --sequencefile standard_output.gapfilled.final.fa --removed_repeats CRL_Step2_Passed_Elements.fasta
 +
mkdir fasta_files
 +
mv Repeat_*.fasta fasta_files/
 +
mv CRL_Step2_Passed_Elements.fasta fasta_files/
 +
cd fasta_files/
 +
perl ~/bin/CRL_Scripts1.0/CRL_Step3.pl --directory . --step2 CRL_Step2_Passed_Elements.fasta --pidentity 60 --seq_c 25
 +
mv CRL_Step3_Passed_Elements.fasta ..
 +
cd ..
 +
perl ~/bin/CRL_Scripts1.0/ltr_library.pl --resultfile Mungbean.result85 --step3 CRL_Step3_Passed_Elements.fasta --sequencefile standard_output.gapfilled.final.fa
 +
cp lLTR_Only.lib ../lLTR_Only_85.lib
 +
cat lLTR_Only.lib ../../MITE/MITE.lib > repeats_to_mask_LTR85.fasta
 +
/data/skyts0401/program/RepeatMasker/RepeatMasker -lib repeats_to_mask_LTR85.fasta -nolow -dir . Mungbean.outinner85
 +
perl ~/bin/CRL_Scripts1.0/cleanRM.pl Mungbean.outinner85.out Mungbean.outinner85.masked > Mungbean.outinner85.unmasked
 +
perl ~/bin/CRL_Scripts1.0/rmshortinner.pl Mungbean.outinner85.unmasked 50 > Mungbean.outinner85.clean
 +
makeblastdb -in ~/bin/Tpases020812DNA -dbtype prot
 +
blastx -query Mungbean.outinner85.clean -db ~/bin/Tpases020812DNA -evalue 1e-10 -num_descriptions 10 -out Mungbean.outinner85.clean_blastx.out.txt
 +
perl ~/bin/CRL_Scripts1.0/outinner_blastx_parse.pl --blastx Mungbean.outinner85.clean_blastx.out.txt --outinner Mungbean.outinner85
 +
perl ~/bin/CRL_Scripts1.0/CRL_Step4.pl --step3 CRL_Step3_Passed_Elements.fasta --resultfile Mungbean.result85 --innerfile passed_outinner_sequence.fasta --sequencefile standard_output.gapfilled.final.fa
 +
makeblastdb -in lLTRs_Seq_For_BLAST.fasta -dbtype nucl
 +
blastn -query lLTRs_Seq_For_BLAST.fasta -db lLTRs_Seq_For_BLAST.fasta -evalue 1e-10 -num_descriptions 1000 -out lLTRs_Seq_For_BLAST.fasta.out
 +
makeblastdb -in Inner_Seq_For_BLAST.fasta -dbtype nucl
 +
blastn -query Inner_Seq_For_BLAST.fasta -db Inner_Seq_For_BLAST.fasta -evalue 1e-10 -num_descriptions 1000 -out Inner_Seq_For_BLAST.fasta.out
 +
perl ~/bin/CRL_Scripts1.0/CRL_Step5.pl --LTR_blast lLTRs_Seq_For_BLAST.fasta.out --inner_blast Inner_Seq_For_BLAST.fasta.out --step3 CRL_Step3_Passed_Elements.fasta --final LTR85.lib --pcoverage 90 --pidentity 80
 +
/data/skyts0401/program/RepeatMasker/RepeatMasker -lib ../99/LTR99.lib -dir . LTR85.lib
 +
perl ~/bin/CRL_Scripts1.0/remove_masked_sequence.pl --masked_elements LTR85.lib.masked --outfile FinalLTR85.lib
 +
cd ..
 +
cat 99/LTR99.lib 85/FinalLTR85.lib > allLTR.lib
 +
 +
Collecting repetitive sequences
 +
ln -s /data/skyts0401/Mungbean/assembly/standard_output.gapfilled.final.fa .
 +
nano fasta_devide.py
 +
python fasta_devide.py standard_output.gapfilled.final.fa
 +
nano repeatmask_combine.sh
 +
chmod a+x repeatmask_combine.sh
 +
./repeatmask_combine.sh
 +
cat standard_output.gapfilled.final_devide*.fa.masked > standard_output.gapfilled.final.fa.masked
 +
perl ~/bin/CRL_Scripts1.0/rmaskedpart.pl standard_output.gapfilled.final.fa.masked 50 > umseqfile
 +
/data/skyts0401/program/RepeatModeler-open-1.0.9/BuildDatabase -name umseqfiledb -engine ncbi umseqfile
 +
nohup /data/skyts0401/program/RepeatModeler-open-1.0.9/RepeatModeler -database umseqfiledb >& umseqfile.out
 +
perl ~/bin/CRL_Scripts1.0/repeatmodeler_parse.pl --fastafile consensi.fa.classified --unknowns repeatmodeler_unknowns.fasta --identities repeatmodeler_identities.fasta
 +
makeblastdb -in ~/bin/Tpases020812 -dbtype prot
 +
blastx -query repeatmodeler_unknowns.fasta -db ~/bin/Tpases020812 -evalue 1e-10 -num_descriptions 10 -out modelerunknown_blast_result.txt
 +
~/bin/CRL_Scripts1.0/transposon_blast_parse.pl --blastx modelerunknown_blast_result.txt --modelerunknown repeatmodeler_unknowns.fasta
 +
mv unknown_elements.txt ModelerUnknown.lib
 +
cat identified_elements.txt repeatmodeler_identities.fasta > ModelerID.lib
 +
 +
Exclusion of gene fragments
 +
makeblastdb -in ~/bin/alluniRefprexp070416 -dbtype prot
 +
blastx -query ModelerUnknown.lib -db ~/bin/alluniRefprexp070416 -evalue 1e-10 -num_descriptions 10 -out ModelerUnknown.lib_blast_result.txt
 +
cd LTR/
 +
python headerforamt.py allLTR.lib > allLTR.lib.reformed (LTR library has '(' symbol, resulting in ProtExcluder error, so change the format)
 +
cd ..
 +
mkdir ProtExclude
 +
cd ProtExclude/
 +
cp ../MITE/MITE.lib .
 +
cp ../LTR/allLTR.lib.reformed .
 +
cp ../ModelerID.lib .
 +
cp ../ModelerUnknown.lib .
 +
cat allLTR.lib.reformed MITE.lib ModelerID.lib > KnownRepeats.lib
 +
cat KnownRepeats.lib ModelerUnknown.lib > allRepeats.lib
 +
blastx -query allRepeats.lib -db ~/bin/alluniRefprexp070416 -evalue 1e-10 -num_descriptions 10 -out allRepeats.lib_blast_results.txt
 +
/data/skyts0401/program/ProtExcluder1.2/ProtExcluder.pl allRepeats.lib_blast_results.txt allRepeats.lib
 +
 +
== 5/26 ==
 +
=== Mungbean pacbio assembly ===
 +
For assessment of assembly, run CEGMA and BUSCO
 +
 +
 +
<big>'''Install'''</big>
 +
 +
CEGMA
 +
(63:/data/skyts0401/program/)
 +
sudo apt-get install wise (dependency)
 +
wget ftp://genome.crg.es/pub/software/geneid/geneid_v1.4.4.Jan_13_2011.tar.gz (dependency)
 +
tar -xvzf geneid_v1.4.4.Jan_13_2011.tar.gz
 +
cd geneid
 +
make
 +
make install
 +
nano ~/.profile (add $PATH:/data/skyts0401/program/geneid/bin)
 +
cd ..
 +
git clone https://github.com/KorfLab/CEGMA_v2.git
 +
cd CEGMA_v2/
 +
make
 +
 +
 +
BUSCO
 +
(63:/data/skyts0401/program/)
 +
wget http://bioinf.uni-greifswald.de/augustus/binaries/augustus-3.2.3.tar.gz (dependency)
 +
tar -xvzf augustus-3.2.3.tar.gz
 +
cd augustus-3.2.3/
 +
make (dependency error)
 +
sudo apt-get install bamtools libbamtools-dev
 +
make
 +
sudo make install
 +
cd ..
 +
git clone https://gitlab.com/ezlab/busco.git
 +
cd busco
 +
sudo python setup.py install
 +
cp config/config.ini.default config/config.ini
 +
nano config.ini (change the augustus path (path = /data/skyts0401/program/augustus-3.2.3/scripts/, bin/))
 +
 +
 +
<big>'''Running'''</big>
 +
 +
CEGMA
 +
(63:/data/skyts0401/Mungbean/cegma/)
 +
export CEGMA="/data/skyts0401/program/CEGMA_v2"
 +
export PERL5LIB="$PERL5LIB:$CEGMA/lib"
 +
export PERL5LIB=$CEGMA/lib:$PERL5LIB
 +
source ~/.profile
 +
/data/skyts0401/program/CEGMA_v2/bin/cegma --genome standard_output.gapfilled.final.fa -threads 5
 +
 +
 +
BUSCO
 +
(63:/data/skyts0401/Mungbean/busco/)
 +
wget http://busco.ezlab.org/datasets/eukaryota_odb9.tar.gz (dataset)
 +
wget http://busco.ezlab.org/datasets/embryophyta_odb9.tar.gz (dataset)
 +
ln -s ../assembly/standard_output.gapfilled.final.fa .
 +
export AUGUSTUS_CONFIG_PATH="/data/skyts0401/program/augustus-3.2.3/config/"
 +
python /data/skyts0401/program/busco/scripts/run_BUSCO.py -i standard_output.gapfilled.final.fa -o Mungbean_busco -c 20 -l eukaryota_odb9/ -m geno
 +
python /data/skyts0401/program/busco/scripts/run_BUSCO.py -i standard_output.gapfilled.final.fa -o Mungbean_plant_busco -c 20 -l embryophyta_odb9/ -m geno
 +
 +
== 5/29 ==
 +
=== Mungbean pacbio assembly ===
 +
Maker
 +
 +
 +
<big>'''Install'''</big>
 +
 +
ncbi-blast+
 +
(63:/data/skyts0401/program/)
 +
wget ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/2.6.0/ncbi-blast-2.6.0+-x64-linux.tar.gz
 +
tar -xvzf ncbi-blast-2.6.0+-x64-linux.tar.gz
 +
 +
 +
exonerate
 +
(63:/data/skyts0401/program/)
 +
git clone https://github.com/nathanweeks/exonerate.git
 +
cd exonerate/
 +
git checkout v2.4.0
 +
autoreconf -i
 +
./configure
 +
make
 +
sudo make install
 +
 +
 +
Maker
 +
(63:/data/skyts0401/program/)
 +
download from http://www.yandell-lab.org/software/maker.html
 +
cd maker/
 +
cd src/
 +
nano ~/.profile (add the RepeatMasker directory to $PATH)
 +
source ~/.profile
 +
perl Build.PL
 +
./Build install
 +
 +
 +
<big>'''Running'''</big>
 +
 +
Preparation
 +
(63:/data/skyts0401/Mungbean/maker/)
 +
(Add PATH(/data/skyts0401/program/maker/bin) to ~/.profile)
 +
ln -s ../assembly/Vradi.pacbio.gapfilled.final.fa .
 +
mkdir ../transcriptome
 +
cd ../transcriptome/
 +
scp skyts0401@147.46.250.244:/data/KangYJ/Mungbean/Transcriptome/merge/mungbean_merge.fa.cdhit.fa .
 +
cd ../maker/
 +
ln -s ../transcriptome/mungbean_merge.fa.cdhit.fa .
 +
mkdir ref
 +
cd ref/
 +
(download Fvesca annotation file from phytozome)
 +
unzip Fvesca_download.zip
 +
cd Fvesca/v1.1/annotation/
 +
gunzip Fvesca_226_v1.1.protein.fa.gz
 +
gunzip Fvesca_226_v1.1.transcript.fa.gz
 +
cd ../../..
 +
cp Fvesca/v1.1/annotation/Fvesca_226_v1.1.protein.fa .
 +
cp Fvesca/v1.1/annotation/Fvesca_226_v1.1.transcript.fa .
 +
scp skyts0401@147.46.250.244:/alima9002/ref/forJat/Athaliana_167_TAIR10*.fa
 +
scp skyts0401@147.46.250.244:/alima9002/ref/forJat/Athaliana_167_TAIR10*.fa .
 +
scp skyts0401@147.46.250.244:/alima9002/ref/forJat/Gmax*.fa .
 +
scp skyts0401@147.46.250.244:/alima9002/ref/forJat/Ptrichocarpa*.fa .
 +
scp skyts0401@147.46.250.244:/alima9002/ref/forJat/Vvinifera*.fa .
 +
scp skyts0401@147.46.250.244:/alima9002/ref/forJat/Osativa*.fa .
 +
cd ..
 +
ln -s ../repeatmask/ProtExclude/allRepeats.libnoProtFinal
 +
mkdir tmp
 +
 +
 +
Running
 +
(63:/data/skyts0401/Mungbean/maker/)
 +
maker -CTL
 +
nano maker_bopts.ctl (default, check blast_type=ncbi+)
 +
nano maker_exe.ctl (change the path ncbi-blast+, RepeatMasker, exonerate, augustus)
 +
nano maker_opts.ctl (change the path genome, evidence(transcriptome, protein), repeat library, temporary directory)
 +
mpiexec -n 30 maker -fix_nucleotides maker_opts.ctl maker_bopts.ctl maker_exe.ctl >& maker_opts.ctl.log
 +
 +
== 6/26 ==
 +
=== Mungbean pacbio assembly ===
 +
checking synteny block for chromosome split, combine
 +
 +
 +
blast
 +
(NICEM:~/data/Mungbean/blast)
 +
makeblastdb -in Vradi.ver6.cor.pep.fa -dbtype 'prot'
 +
blastall -i adzuki.ver3.pep.fa.tr.cor.fa -d Vradi.ver6.cor.pep.fa -p blastp -e 1e-10 -b 5 -v 5 -m 8 -o mcscanx/old_Va.blast
 +
# same procedure for other organism protein
 +
 +
 +
MCScanX
 +
(NICEM:~/data/Mungbean/blast/mcscanx)
 +
python gffcombine.py Vradi_ver6.gff.sorted.by.TY.gff adzuki.ver3.gene.gff.cor.gff > old_Va.gff
 +
~/data/program/MCScanX/MCScanX old_Va
 +
# same procedure for other organism protein, just change the species name in gffcombine.py and command
 +
 +
 +
Circos
 +
(193:/data2/skyts0401/Mungbean/synteny/circos/)
 +
/data2/skyts0401/program/circos-0.69-4/bin/circos -conf synteny_Va_gene.conf -outputfile synteny_Va_gene.png
 +
# same procedure for other organism, change configuration file

Latest revision as of 06:28, 4 July 2017

Contents

1 / 9

Minyoung_UV_QTL

parsing genotype data(Joinmap) and phenotype data to ICImapping format(bip format)
using lgcombine.py(63:/data/skyts0401/Mungbean/MY_UV/)
find linkage group 2 map is wrong, construct map newly


Mungbean synchronous QTL

make loc file(244:/home/skyts0401/reseq/chr/Mungbean_chr_coseg_parse_seg_dist.loc)
missing > 10%, hetero > 10, depth < 3 marker is filtered
while grouping them, found that vr03 and vr04 are combined in one group and vr05 is split into 2 groups; check it.


Mungbean pacbio assembly

moving SAM data(align reseq data on pacbio-scaffold) from NICEM server to 244 server (244:/kev8305/SK3/)

1 / 10

Minyoung_UV_QTL

QTL analysis by using IciMapping


Mungbean synchronous QTL

construct genetic map (JoinMap 4.1), just using chr 3, 4 combined and chr 5 split linkage group.

ML method, Haldane algorithm


Mungbean pacbio assembly

convert SAM format to BAM format (244:/kev8305/SK3/)

./convertbam.sh

1/ 11

Mungbean synchronous QTL

QTL analysis by using RQTL(desktop:/Users/sky/desktop/Mungbean_syn_RQTL.csv)
just for checking locus

1/16

Mungbean pacbio assembly

copying sorted.bam file from 244 server to 63 server

variant calling (244:/kev8305/SK3/, 63:/data/skyts0401/Mungbean/mapping/resequencing/)

samtools mpileup -f falcon_500_sspace.final.scaffolds.fasta -v -t DP,AD,ADF,ADR,SP,INFO/AD,INFO/ADF,INFO/ADR -b bam_list | bcftools call -v -m -O v > variants.vcf
samtools mpileup -f falcon_500_sspace.final.scaffolds.fasta -I -v -t DP,AD,ADF,ADR,SP,INFO/AD,INFO/ADF,INFO/ADR -b bam_list | bcftools call -v -m -O v > variants_snp.vcf

1/18

Mungbean pacbio assembly

variant calling Kyoungki Jarae #5 with pacbio falcon scaffold (63:/data/skyts0401/Mungbean/mapping/resequencing/)

bwa index falcon_500_sspace.final.scaffolds.fasta
bwa mem -t 10 falcon_500_sspace.final.scaffolds.fasta KJ-C_1.fastq.gz KJ-C_2.fastq.gz > KJ-pe_falcon_scaffold.sam

1/19

Mungbean pacbio assembly

variant calling Kyoungki Jarae #5 with pacbio falcon scaffold (63:/data/skyts0401/Mungbean/mapping/resequencing/)

samtools view -Sb KJ-pe_falcon_scaffold.sam > KJ-pe_falcon_scaffold.bam
samtools sort KJ-pe_falcon_scaffold.bam -o KJ-pe_falcon_scaffold.sorted.bam
samtools index KJ-pe_falcon_scaffold.sorted.bam
samtools mpileup -f falcon_500_sspace.final.scaffolds.fasta -I -v -t DP,AD,ADF,ADR,SP,INFO/AD,INFO/ADF,INFO/ADR -u KJ-pe_falcon_scaffold.sorted.bam | bcftools call -v -m -O v > KJ_falcon_scaffold_variants_snp.vcf

1/24

Jatropha assembly

make svg file for superscaffold - linkage group marker location (63:/home/skyts0401/svg/)

python make_chr_lg_svg.py standard_output.final.scaffolds.fasta.tr.JM_out.fa standard_output.final.scaffolds.fasta LG.total.txt.reformed standard_output.final.scaffolds.fasta.tr.JM_out.fa.log > chr_lg.svg

1/31

Mungbean Chloroplast assembly

pairing Illumina PE read (63:/home/skyts0401/)

sudo python PE-pairing.py /data/jungminh/mungbean/PE/SunhwaN_1_cont.fq /data/jungminh/mungbean/PE/SunhwaN_2_cont.fq

2/2

Mungbean Chloroplast assembly

(63:/data/skyts0401/Mungbean/chloroplast/)

gmap_build -D gmap_db -d v.radiata v.radiata.fasta
gmap --nosplicing -D gmap_db -n 1 -d v.radiata -f samse scaf_cp_20k.fasta -t 12 | samtools view -Sb > Vr-cp_scaf-cp-20k.bam
samtools sort Vr-cp_scaf-cp-20k.bam -o Vr-cp_scaf-cp-20k.sorted.bam
samtools index Vr-cp_scaf-cp-20k.sorted.bam

2/3 ~ 2/6

Mungbean Chloroplast assembly

falcon - path : 63:/home/skyts0401/Falcon_RE/rere/

before run, copy fc_env folder (63:/data/skyts0401/Falcon/)

cp -r ~/FALCON_RE/rere/FALCON-integrate/fc_env YOUR_FOLDER

and configure file is on /home/skyts0401/fc_run.cfg


align canu contig_cp file to canu contig_cp assembly (63:/data/skyts0401/Mungbean/chloroplast/)

~/bowtie2-2.2.9/bowtie2-build Vr_cp_canu.contigs.for.mapping.fasta Vr_cp_canu.contigs.for.mapping.fasta
~/bowtie2-2.2.9/bowtie2 -x Vr_cp_canu.contigs.for.mapping.fasta -f canu_ctg_cp.fasta --end-to-end --very-fast -p 20 -S cp-assembly_canu_ctg_revised.sam
samtools view -Sb cp-assembly_canu-ctg.sam > cp-assembly_canu-ctg.bam
samtools sort cp-assembly_canu-ctg.bam -o cp-assembly_canu-ctg.sorted.bam
samtools index cp-assembly_canu-ctg.sorted.bam
samtools faidx canu_ctg_cp.fasta
....
~/bowtie2-2.2.9/bowtie2 -x Vr_cp_canu.contigs.for.mapping.fasta -f pb.cp.fasta --end-to-end --very-fast -p 20 -S cp-assembly_pb-cp.sam
....
~/bowtie2-2.2.9/bowtie2 -x Vr_cp_canu.contigs.for.mapping.fasta -1 SunhwaN_1_cont.fq.pairing.fq -2 SunhwaN_2_cont.fq.pairing.fq --end-to-end --very-fast -p 20 -S cp-assembly_PE-cp.sam

2/6

Mungbean Chloroplast assembly

assembly (canu) mungbean pacbio corrected read for chloroplast, parameter changed (63:/data/skyts0401/Mungbean/chloroplast/)

~/canu/Linux-amd64/bin/canu -assemble -p cp_read5 -d assembly/cp_read5 genomeSize=154k contigFilter="5 1000 0.75 0.75 2" -pacbio-corrected pb.cp.fasta
~/canu/Linux-amd64/bin/canu -assemble -p cp_read10 -d assembly/cp_read10 genomeSize=154k contigFilter="10 1000 0.75 0.75 2" -pacbio-corrected pb.cp.fasta

and we have 2 contigs (one contig have LSC+IR, and other contig have SSC+IR)

just assemble them(cp_1.fa, cp_2.fa, cp_3.fa)

2/9

Mungbean Chloroplast assembly

quiver(GenomicConsensus) install(63:/data/kev8305/skyts0401/program)

--- boost (ConsensusCore dependency) ---
wget https://sourceforge.net/projects/boost/files/boost/1.63.0/boost_1_63_0.tar.gz
tar -xf boost_1_63_0.tar.gz
cd boost_1_63_0/
./bootstrap.sh
sudo apt-get install python-dev (solution for error-pyconfig.h)
sudo ./b2 install
--- swig (ConsensusCore dependency) ---
wget https://downloads.sourceforge.net/project/swig/swig/swig-3.0.12/swig-3.0.12.tar.gz
tar -xf swig-3.0.12.tar.gz 
cd swig-3.0.12/
./configure 
make
sudo make install
--- ConsensusCore (GenomicConsensus dependency) ---
git clone https://github.com/PacificBiosciences/ConsensusCore.git
cd ConsensusCore/
sudo python setup.py install
--- GenomicConsensus ---
git clone https://github.com/PacificBiosciences/GenomicConsensus.git
sudo apt-get install libhdf5-serial-dev (solution for error-hdf5.h)
sudo make


Align PacBio_chloroplast read to vr.pb.cp.fasta(PacBio cp assembly) (63:/kev8305/Mungbean_assembly/chloroplast/)

bowtie2 -x vr.pb.cp.fasta -f pb.cp.fasta --end-to-end --very-fast -p 4 -S cp-assembly_pb-cp.sam
samtools view -Sb cp-assembly_pb-cp.sam > cp-assembly_pb-cp.bam

Align Illumina Paired-End read to vr.pb.cp.fasta(PacBio cp assembly) (63:/kev8305/Mungbean_assembly/chloroplast/)

bowtie2 -x vr.pb.cp.fasta -1 SunhwaN_1_cont.fq.pairing.fq -2 SunhwaN_2_cont.fq.pairing.fq --end-to-end --very-fast -p 4 -S cp-assembly_PE-cp.sam
samtools view -Sb cp-assembly_PE-cp.sam > cp-assembly_PE-cp.bam

Polishing by Quiver

2/10

Mungbean Chloroplast assembly

For Quiver, aligning Pacbio_chloroplast reads to vr.pb.cp.fasta needs to be done with pbalign, not bowtie or some other program.

pbalign install (63:/kev8305/skyts0401/program)

--- blasr (pbalign dependency) ---
https://github.com/PacificBiosciences/blasr/blob/master/doc/INSTALL_MAKE.md
--- pbcommand (quiver dependency) ---
git clone https://github.com/PacificBiosciences/pbcommand.git
cd pbcommand
sudo python setup.py install
--- pbalign ---
git clone https://github.com/PacificBiosciences/pbalign.git
cd pbalign/
sudo pip install .

pbalign (tried to align by using blasr algorithm , but sam or bam is no longer supported in blasr, so just use bowtie algorithm) (63:/kev8305/Mungbean_assembly/chloroplast/)

pbalign --noSplitSubreads --nproc 4 --algorithm bowtie pb.cp.fasta vr.pb.cp.fasta cp-assembly-pb-cp.for.quiver.sam

2/13

Mungbean Chloroplast assembly

Error occurred while running pbalign, so re-installed blasr (guess: library error)

2/14

Mungbean Chloroplast assembly

variant calling with PE and PB read on chloroplast assembly genome (63:/kev8305/Mungbean_assembly/chloroplast/)

samtools mpileup -f vr.pb.cp.fasta -v -t DP,AD,ADF,ADR,SP,INFO/AD,INFO/ADF,INFO/ADR -u cp-assembly_pb-cp.sorted.bam | bcftools call -v -m -O v > vr.cp_pb_variants.vcf
samtools mpileup -f vr.pb.cp.fasta -v -t DP,AD,ADF,ADR,SP,INFO/AD,INFO/ADF,INFO/ADR -u cp-assembly_PE-cp.sorted.bam | bcftools call -v -m -O v > vr.cp_PE_variants.vcf

2/16

Mungbean Chloroplast assembly

Align PE reads to vr.pb.cp.fasta by using bwa (244:/kev8305/Mungbean_assembly/chloroplast/)

bwa index vr.pb.cp.fasta
bwa mem -t 4 vr.pb.cp.fasta SunhwaN_1_cont.fq.pairing.fq SunhwaN_2_cont.fq.pairing.fq > vr.pb.cp_PE.sam
samtools view -Sb vr.pb.cp_PE.sam > vr.pb.cp_PE.bam
samtools sort vr.pb.cp_PE.bam -o vr.pb.cp_PE.sorted.bam
samtools index vr.pb.cp_PE.sorted.bam
samtools mpileup -f vr.pb.cp.fasta -v -t DP,AD,ADF,ADR,SP,INFO/AD,INFO/ADF,INFO/ADR -u vr.pb.cp_PE.sorted.bam | bcftools call -v -m -O v > variants_PE_bwa.vcf
....
bwa mem -t 4 vr.pb.cp.fasta pb.cp.fasta > vr.pb.cp_PB.sam
samtools view -Sb vr.pb.cp_PB.sam > vr.pb.cp_PB.bam
samtools sort vr.pb.cp_PB.bam -o vr.pb.cp_PB.sorted.bam
samtools index vr.pb.cp_PB.sorted.bam
....
samtools mpileup -f vr.pb.cp.fasta -v -t DP,AD,ADF,ADR,SP,INFO/AD,INFO/ADF,INFO/ADR -u vr.pb.cp_PE.sorted.bam > variants_PE_bwa_all.vcf
python vcf_filtering.py variants_PE_bwa_all.vcf > variants_PE_bwa_all_0.15.vcf

2/21

Mungbean Chloroplast assembly

make a code that read fasta and annotation file(gff or gb) and make a fasta file with gene CDS sequence (63:/kev8305/Mungbean_assembly/chloroplast/)

python getCDS.py vr.pb.cp.fasta vr.pb.cp.gff > vr.pb.cp.gene.fasta
python getCDS.py v.radiata.fasta v.radiata.gb > v.radiata.gene.fasta

2/22

Mungbean pacbio assembly

snp calling done, snp filtering for genetic map construction (244:/kev8305/SK3/)

python ~/reseq/vcfparse_parent.py variants_snp.vcf KJ_falcon_scaffold_variants_snp.vcf
python ~/reseq/vcfparse.py variants_snp_compare_parents.vcf (dp >= 5, missing < 13, hetero < 10)
python ~/reseq/vcfparse_coseg.py variants_snp_compare_parents_filtered.vcf Mungbean_pacbio_scaffold_3.loc
python locparse.py Mungbean_pacbio_scaffold_3_seg_dist.loc > Mungbean_pacbio_scaffold_3_seg_dist_format.loc (scaffold name is too long, eliminate '|')

2/23

Mungbean pacbio assembly

too many snp for joinmap, so filtering missing < 12

python ~/reseq/vcfparse.py variants_snp_compare_parents.vcf
python ~/reseq/vcfparse_coseg.py variants_snp_compare_parents_filtered.vcf Mungbean_pacbio_scaffold_4.loc
python ~/reseq/cal_seg_dist.py Mungbean_pacbio_scaffold_4.loc 
9110
python locparse.py Mungbean_pacbio_scaffold_4_seg_dist.loc > Mungbean_pacbio_scaffold_4_seg_dist_format.loc

2/27

Mungbean pacbio assembly

Mungbean_pacbio_scaffold_7_seg_dist_format.loc : no hetero, missing < 18


ALLMAPS install (244:/kev8305/skyts0401/program)

easy_install biopython numpy deap networkx matplotlib jcvi
wget https://dl.dropboxusercontent.com/u/15937715/Data/ALLMAPS/ALLMAPS-install.sh
sh ALLMAPS-install.sh

and add the directory that includes the ALLMAPS binaries (concorde, faSize, liftOver) to $PATH in ~/.profile


ALLMAPS (244:/kev8305/SK3/anchoring)

python ~/reseq/allmaps_format.py Mungbean_pacbio_5_joinmap.result > Mungbean_pacbio_5_joinmap.for.allmaps
python ~/reseq/allmaps_format.py Mungbean_pacbio_7_joinmap.result > Mungbean_pacbio_7_joinmap.for.allmaps
python -m jcvi.assembly.allmaps merge Mungbean_pacbio_5_joinmap.for.allmaps Mungbean_pacbio_7_joinmap.for.allmaps -o JM-2.bed
python -m jcvi.assembly.allmaps path JM-2.bed falcon_500_sspace.final.scaffolds.fasta.header.fasta

3/2

Mungbean pacbio assembly

MUMmer install, for dot plot between pacbio and previous ref

wget https://downloads.sourceforge.net/project/mummer/mummer/3.23/MUMmer3.23.tar.gz
tar -xvf MUMmer3.23.tar.gz
cd MUMmer3.23
make check
make install
MUMmer3.23/mummer -mum -b -c Vradi.ver6.cor.fa.chr.fa JM-2.chr.fasta > ref_qry.mums

3/3

Mungbean pacbio assembly

lastz install (244:/kev8305/skyts0401/program/)

download from http://www.bx.psu.edu/~rsharris/lastz/
tar -xvzf lastz-1.02.00.tar.gz
cd lastz-distrib-1.02.00/src/
----------------------------------
problem with Makefile, so delete -Werror in line 31 of Makefile, save.
----------------------------------
make
make install

add path /home/skyts0401/lastz-distrib/bin in .profile


lastz (244:/kev8305/SK3/anchoring/)

lastz JM-2.chr.fasta[multiple] Vradi.ver6.cor.fa --notransition --step=20 --gfextend --chain --gapped --format=sam > old_new.sam

3/7

Mungbean pacbio assembly

MUMmer, having a problem with memory, was re-installed with a memory configuration

make clean
make CPPFLAGS="-O3 -DSIXTYFOURBITS"
make install


and use nucmer to align pacbio assembly and previous reference

MUMmer3.23/nucmer -maxmatch -c 100 -p ref_qry JM-2.chr.fasta Vradi.ver6.cor.fa
MUMmer3.23/nucmer --noextend -c 100 -p ref_qry_noextend JM-2.chr.fasta Vradi.ver6.cor.fa


and draw a dot plot using mummerplot

mummerplot --fat -l -png ref_qry_noextend.delta

but an error occurs, like

set mouse clipboardformat "[%.0f, %.0f]"
          ^
"out.gp", line 2594: wrong option

It seems gnuplot was updated and no longer supports that option, which mummerplot writes into out.gp. Just edit out.gp and delete that line.

/kev8305/skyts0401/program/last-842/scripts/last-dotplot -2 'Vr*' -2 'scaffold_?' -x 1920 -y 1920 ref_qry.maf plot.png

3/16 ~

Mungbean pacbio assembly

compare between pacbio assembly and previous reference

1. 50 reseq marker/LG on previous reference mapping on pacbio super scaffold for checking same marker is on same chromosome. (244:/kev8305/SK3/anchoring/check)

python SNP_marker_pos.py Vradi_ver6.fa Mungbean_chr_coseg_parse_seg_dist.loc > Vradi.ver6.reseq.marker.fasta
makeblastdb -in JM-2.chr.fasta -dbtype 'nucl' -out Mungbean_pacbio
blastn -db Mungbean_pacbio -query Vradi.ver6.reseq.marker.fasta -outfmt 6 -out reseq_marker.blast -num_threads 2 -evalue 1e-5 -word_size 100
python blastparse.py reseq_marker.blast > reseq_marker_for_svg.result
python chr_compare_svg.py fasta.size reseq_marker_for_svg.result > chr_compare_3.svg (output can be changed based on option in python code)
/data2/skyts0401/program/circos-0.69-4/bin/circos -conf chr_compare.conf (193:/data2/skyts0401/check/circos)

2. contig compare. (63:/data/skyts0401/Mungbean/assembly/)

scp assembly@147.46.250.181:/home/assembly/data/Mungbean/mapping/p_ctg.longest.fa .
scp skyts0401@147.46.250.244:/kev8305/SK3/anchoring/final.contigs.longest100.fa .
gmap_build -d pacbio_contig_new p_ctg.longest.fa -D ./
gmap -d pacbio_contig_new -D pacbio_contig_new/ final.contigs.longest100.fa -t 12 -f 1 > pacbio_contig_compare.psl
--------------------------------------------------------------------------------------
(NICEM:/home/assembly/check/)
../bwa-0.7.15/bwa mem -t 30 p_ctg.longest.longest3.fa SunhwaN_1.fastq.gz SunhwaN_2.fastq.gz > newcontig_illumina.sam
 
(244:/kev8305/SK3/anchoring/check/)
ln -s /NGS/NGS/VignaRadiata/DNA/Sunhwa_pacbio/filtered_subreads.fasta .
bwa index p_ctg.longest.longest1.fa
bwa mem -t 8 p_ctg.longest.longest1.fa filtered_subreads.fasta > newcontig_pacbio.sam
 
samtools view -Sb newcontig_pacbio.sam > newcontig_pacbio.bam
samtools sort newcontig_pacbio.bam -o newcontig_pacbio.sorted.bam
samtools index newcontig_pacbio.sorted.bam
 ~ same samtools command with newcontig_illumina.sam ~

Found that some reads looked like split mappings, so re-aligned with the end-to-end mode of bowtie2

(NICEM:~/check/)
~/bowtie2-2.2.9/bowtie2-build p_ctg.longest.longest1.fa p_ctg.longest.longest1.fa
~/bowtie2-2.2.9/bowtie2 -x p_ctg.longest.longest1.fa -1 SunhwaN_1.fastq.gz -2 SunhwaN_2.fastq.gz --end-to-end --very-fast -p 30 -S newcontig_illumina_endtoend.sam
~/bowtie2-2.2.9/bowtie2 -x p_ctg.longest.longest1.fa -f filtered_subreads.fasta --end-to-end --very-fast -p 30 -S newcontig_pacbio_endtoend.sam

!!!bowtie2-2.3.0 version has a bug!!!
(244:/kev8305/SK3/anchoring/check/)
scp assembly@147.46.250.181:/home/assembly/check/newcontig_pacbio_endtoend.sam .
scp assembly@147.46.250.181:/home/assembly/check/newcontig_illumina_endtoend.sam .
~ same samtools command, view, sort, index ~
samtools depth -a -q 0 -Q 0 -r 000000F:2000000-4000000 newcontig_illumina_endtoend.sorted.bam > newcontig_illumina_endtoend.mapping.depth
samtools depth -a -q 0 -Q 0 -r 000000F:2000000-4000000 newcontig_pacbio_endtoend.sorted.bam > newcontig_pacbio_endtoend.mapping.depth

blat for comparing contig (NICEM:/home/assembly/check/, 244:/kev8305/SK3/anchoring/check/)

------------------------------
(contig_compare.sh)
#!/bin/bash

for i in {0..19}; do
        ../blat p_ctg.longest.longest3.fa final.contigs_devide${i}.fa contig_compare_all_${i}.psl &
done

wait
------------------------------
(NICEM)
python fasta_devide.py final.contigs.reformed.fasta 
chmod a+x contig_compare.sh 
./contig_compare.sh 
ls contig_compare_all_*.psl > psl.list
nano pslfilter.py 
python pslfilter.py psl.list > contig_compare_all.result
python pslfilter2.py contig_compare_all.result > contig_compare_all_filtered.result

4/18

Jatropha assembly

make Jatropha figure(chr - lg) for new version(allmaps) (244:/kev8305/skyts0401/Jatropha)

scp skyts0401@147.46.250.63:/home/skyts0401/svg/make_chr_lg_svg.py make_chr_lg_svg_revised_for_allmaps.py
python make_chr_lg_svg_revised_for_allmaps.py Jatropha_map1.result Jatropha.allmaps.agp > Jatropha_chr_lg.svg

4/26

Mungbean pacbio assembly

mungbean super scaffold (JM-2.fasta) was gap filled. Final assembly Fasta is in /kev8305/SK3/anchoring/gapfilled_assembly_final/

5/1

Mungbean pacbio assembly

Repeat masking progress is based on these sites: http://weatherby.genetics.utah.edu/MAKER/wiki/index.php/Repeat_Library_Construction-Advanced, http://www.repeatmasker.org/


Repeat masking program installation


Repbase - for RepeatMasker

(63:/data/skyts0401/program/)
should register http://www.girinst.org/
download RepBaseRepeatMaskerEdition
cp RepBaseRepeatMaskerEdition-20170127\ \(1\).tar.gz RepeatMasker/.
cd RepeatMasker/
tar -xvzf RepBaseRepeatMaskerEdition-20170127\ \(1\).tar.gz
(Libraries/ directory will be created and all files will be copied to RepeatMasker/Libraries/)

rmblast - for RepeatMasker (ver 2.6.0 has problem with install, so I installed v. 2.2.28)

(63:/data/skyts0401/program/)
download from http://www.repeatmasker.org/RMBlast.html
wget ftp://ftp.ncbi.nlm.nih.gov/blast/executables/rmblast/2.2.28/ncbi-rmblastn-2.2.28-x64-linux.tar.gz
wget ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/2.2.28/ncbi-blast-2.2.28+-x64-linux.tar.gz
tar zxvf ncbi-blast-2.2.28+-x64-linux.tar.gz 
tar zxvf ncbi-rmblastn-2.2.28-x64-linux.tar.gz 
cp -R ncbi-rmblastn-2.2.28/* ncbi-blast-2.2.28+/
rm -rf ncbi-rmblastn-2.2.28
mv ncbi-blast-2.2.28+ rmblast-2.2.28

trf - for RepeatMasker

(63:/data/skyts0401/program/)
download from http://tandem.bu.edu/trf/trf.html
chmod a+x trf409.linux64
ln -s trf409.linux64 RepeatMasker/trf

RepeatMasker

(63:/data/skyts0401/program/)
wget http://www.repeatmasker.org/RepeatMasker-open-4-0-7.tar.gz
tar -xzf RepeatMasker-open-4-0-7.tar.gz
cd RepeatMasker/
(move the Repbase library to RepeatMasker/Libraries/)
perl ./configure
configure directory of trf, rmblast

muscle - for MITE-Hunter

(63:/data/skyts0401/program/)
check version on http://www.drive5.com/muscle/
wget http://www.drive5.com/muscle/downloads3.8.31/muscle3.8.31_i86linux64.tar.gz
tar -xvzf muscle3.8.31_i86linux64.tar.gz
mkdir muscle
mv muscle3.8.31_i86linux64 muscle/

mdust - for MITE-Hunter

(63:/data/skyts0401/program/)
wget ftp://occams.dfci.harvard.edu/pub/bio/tgi/software//seqclean/mdust.tar.gz
tar -xvzf mdust.tar.gz

MITE-Hunter

(63:/data/skyts0401/program/)
check version on http://target.iplantcollaborative.org/mite_hunter.html
wget http://target.iplantcollaborative.org/mite_hunter/MITE%20Hunter-11-2011.zip
unzip MITE\ Hunter-11-2011.zip
mv MITE\ Hunter/ MITE_Hunter
cd MITE_Hunter/
perl MITE_Hunter_Installer.pl -d /data/skyts0401/program/MITE_Hunter -f formatdb -b blastall -m /data/skyts0401/program/mdust -M /data/skyts0401/program/muscle

GenomeTools

(63:/data/skyts0401/program/)
check version on http://genometools.org/
wget http://genometools.org/pub/genometools-1.5.9.tar.gz
tar -xvzf genometools-1.5.9.tar.gz
cd genometools-1.5.9/
make
sudo make install
- if have a problem with dependency, please check this -
sudo apt-get install libcairo2-dev
sudo apt-get install libpango1.0-dev

Genome tRNA database

(63:/home/skyts0401/bin/)
check version on http://gtrnadb.ucsc.edu
wget http://gtrnadb2009.ucsc.edu/download/tRNAs/eukaryotic-tRNAs.fa.gz
gunzip eukaryotic-tRNAs.fa.gz

CRL scripts

(63:/home/skyts0401/bin/)
wget http://www.hrt.msu.edu/uploads/535/78637/CRL_Scripts1.0.tar.gz
tar -xvzf CRL_Scripts1.0.tar.gz

transposons protein database

(63:/home/skyts0401/bin/)
wget http://www.hrt.msu.edu/uploads/535/78637/Tpases020812DNA.gz
gunzip Tpases020812DNA.gz
wget http://www.hrt.msu.edu/uploads/535/78637/Tpases020812.gz
gunzip Tpases020812.gz

plant protein database

(63:/home/skyts0401/bin/)
wget http://www.hrt.msu.edu/uploads/535/78637/alluniRefprexp070416.gz
gunzip alluniRefprexp070416.gz

RECON - for RepeatModeler

(63:/data/skyts0401/program/)
wget http://www.repeatmasker.org/RepeatModeler/RECON-1.08.tar.gz
tar -xvzf RECON-1.08.tar.gz
cd RECON-1.08/src/
make
make install
cd ../scripts/
nano recon.pl (added /data/skyts0401/program/RECON-1.08/bin to PATH = "" (third line))

RepeatScout - for RepeatModeler

(63:/data/skyts0401/program/)
wget http://www.repeatmasker.org/RepeatScout-1.0.5.tar.gz
tar -xvzf RepeatScout-1.0.5.tar.gz
cd RepeatScout-1/
make
sudo make install

nseg - for RepeatModeler

(63:/data/skyts0401/program/)
mkdir nseg
cd nseg
wget ftp://ftp.ncbi.nih.gov/pub/seg/nseg/* .
make

RepeatModeler

(63:/data/skyts0401/program/)
wget http://www.repeatmasker.org/RepeatModeler/RepeatModeler-open-1.0.9.tar.gz
tar -xvzf RepeatModeler-open-1.0.9.tar.gz 
cd RepeatModeler-open-1.0.9/
perl ./configure
configure directory of RECON, RepeatScout, nseg, trf, rmblast

hmmer - for ProtExcluder

(63:/data/skyts0401/program/)
wget http://eddylab.org/software/hmmer3/3.1b2/hmmer-3.1b2-linux-intel-x86_64.tar.gz
tar -xvzf hmmer-3.1b2-linux-intel-x86_64.tar.gz 
cd hmmer-3.1b2-linux-intel-x86_64/
./configure 
make
sudo make install

ProtExcluder

wget http://www.hrt.msu.edu/uploads/535/78637/ProtExcluder1.2.tar.gz
tar -xvzf ProtExcluder1.2.tar.gz 
cd ProtExcluder1.2/
./Installer.pl -m /data/skyts0401/program/hmmer-3.1b2-linux-intel-x86_64/binaries/ -p /data/skyts0401/program/ProtExcluder1.2/


Repeat masking progress

Basic command which I used is based on http://weatherby.genetics.utah.edu/MAKER/wiki/index.php/Repeat_Library_Construction-Advanced


Move Mungbean genome assembly final version

scp skyts0401@147.46.250.244:/kev8305/SK3/anchoring/gapfilled_assembly_final/standard_output.gapfilled.final.fa .

MITE library

(63:/data/skyts0401/program/MITE_Hunter/)
perl MITE_Hunter_manager.pl -i /data/skyts0401/Mungbean/assembly/standard_output.gapfilled.final.fa -g Mungbean -c 10 -S 12345678
mv Mungbean* /data/skyts0401/Mungbean/repeatmask/MITE/.
cd /data/skyts0401/Mungbean/repeatmask/MITE/
cat Mungbean_Step8_*.fa > ../MITE.lib

LTR library

(63:/data/skyts0401/Mungbean/repeatmask/LTR/99/)
ln -s /data/skyts0401/Mungbean/assembly/standard_output.gapfilled.final.fa .
gt suffixerator -db standard_output.gapfilled.final.fa -indexname Mungbean_LTR -tis -suf -lcp -des -ssp -dna
gt ltrharvest -index Mungbean_LTR -out Mungbean.out99 -outinner Mungbean.outinner99 -gff3 Mungbean.gff99 -minlenltr 100 -maxlenltr 6000 -mindistltr 1500 -maxdistltr 25000 -mintsd 5 -maxtsd 5 -motif tgca -similar 99 -vic 10 > Mungbean.result99
gt gff3 -sort Mungbean.gff99 > Mungbean.gff99.sort
gt ltrdigest -trnas ~/bin/eukaryotic-tRNAs.fa Mungbean.gff99.sort Mungbean_LTR > Mungbean.gff99.dgt
perl ~/bin/CRL_Scripts1.0/CRL_Step1.pl --gff Mungbean.gff99.dgt 
perl ~/bin/CRL_Scripts1.0/CRL_Step2.pl --step1 CRL_Step1_Passed_Elements.txt --repeatfile Mungbean.out99 --resultfile Mungbean.result99 --sequencefile standard_output.gapfilled.final.fa --removed_repeats CRL_Step2_Passed_Elements.fasta
mkdir fasta_files
mv Repeat_*.fasta fasta_files/
mv CRL_Step2_Passed_Elements.fasta fasta_files/
cd fasta_files/
perl ~/bin/CRL_Scripts1.0/CRL_Step3.pl --directory . --step2 CRL_Step2_Passed_Elements.fasta --pidentity 60 --seq_c 25
mv CRL_Step3_Passed_Elements.fasta ..
cd ..
perl ~/bin/CRL_Scripts1.0/ltr_library.pl --resultfile Mungbean.result99 --step3 CRL_Step3_Passed_Elements.fasta --sequencefile standard_output.gapfilled.final.fa 
cp lLTR_Only.lib ../lLTR_Only_99.lib
cat lLTR_Only.lib ../../MITE/MITE.lib > repeats_to_mask_LTR99.fasta
/data/skyts0401/program/RepeatMasker/RepeatMasker -lib repeats_to_mask_LTR99.fasta -nolow -dir . Mungbean.outinner99
perl ~/bin/CRL_Scripts1.0/cleanRM.pl Mungbean.outinner99.out Mungbean.outinner99.masked > Mungbean.outinner99.unmasked
perl ~/bin/CRL_Scripts1.0/rmshortinner.pl Mungbean.outinner99.unmasked 50 > Mungbean.outinner99.clean
makeblastdb -in ~/bin/Tpases020812DNA -dbtype prot
blastx -query Mungbean.outinner99.clean -db ~/bin/Tpases020812DNA -evalue 1e-10 -num_descriptions 10 -out Mungbean.outinner99.clean_blastx.out.txt
perl ~/bin/CRL_Scripts1.0/outinner_blastx_parse.pl --blastx Mungbean.outinner99.clean_blastx.out.txt --outinner Mungbean.outinner99
perl ~/bin/CRL_Scripts1.0/CRL_Step4.pl --step3 CRL_Step3_Passed_Elements.fasta --resultfile Mungbean.result99 --innerfile passed_outinner_sequence.fasta --sequencefile standard_output.gapfilled.final.fa 
makeblastdb -in lLTRs_Seq_For_BLAST.fasta -dbtype nucl
blastn -query lLTRs_Seq_For_BLAST.fasta -db lLTRs_Seq_For_BLAST.fasta -evalue 1e-10 -num_descriptions 1000 -out lLTRs_Seq_For_BLAST.fasta.out
makeblastdb -in Inner_Seq_For_BLAST.fasta -dbtype nucl
blastn -query Inner_Seq_For_BLAST.fasta -db Inner_Seq_For_BLAST.fasta -evalue 1e-10 -num_descriptions 1000 -out Inner_Seq_For_BLAST.fasta.out
perl ~/bin/CRL_Scripts1.0/CRL_Step5.pl --LTR_blast lLTRs_Seq_For_BLAST.fasta.out --inner_blast Inner_Seq_For_BLAST.fasta.out --step3 CRL_Step3_Passed_Elements.fasta --final LTR99.lib --pcoverage 90 --pidentity 80

relatively old LTR (Same command with above one, but for relatively old LTR)

(63:/data/skyts0401/Mungbean/repeatmask/LTR/85)
(to avoid confuse LTR_99 with this results, make directory 99 and 85 in LTR directory)
ln -s /data/skyts0401/Mungbean/assembly/standard_output.gapfilled.final.fa .
gt suffixerator -db standard_output.gapfilled.final.fa -indexname Mungbean_LTR -tis -suf -lcp -des -ssp -dna
gt ltrharvest -index Mungbean_LTR -out Mungbean.out85 -outinner Mungbean.outinner85 -gff3 Mungbean.gff85 -minlenltr 100 -maxlenltr 6000 -mindistltr 1500 -maxdistltr 25000 -mintsd 5 -maxtsd 5 -vic 10  > Mungbean.result85
cp ../99/CRL_Step1_Passed_Elements.txt .
perl ~/bin/CRL_Scripts1.0/CRL_Step2.pl --step1 CRL_Step1_Passed_Elements.txt --repeatfile Mungbean.out85 --resultfile Mungbean.result85 --sequencefile standard_output.gapfilled.final.fa --removed_repeats CRL_Step2_Passed_Elements.fasta
mkdir fasta_files
mv Repeat_*.fasta fasta_files/
mv CRL_Step2_Passed_Elements.fasta fasta_files/
cd fasta_files/
perl ~/bin/CRL_Scripts1.0/CRL_Step3.pl --directory . --step2 CRL_Step2_Passed_Elements.fasta --pidentity 60 --seq_c 25
mv CRL_Step3_Passed_Elements.fasta ..
cd ..
perl ~/bin/CRL_Scripts1.0/ltr_library.pl --resultfile Mungbean.result85 --step3 CRL_Step3_Passed_Elements.fasta --sequencefile standard_output.gapfilled.final.fa 
cp lLTR_Only.lib ../lLTR_Only_85.lib
cat lLTR_Only.lib ../../MITE/MITE.lib > repeats_to_mask_LTR85.fasta
/data/skyts0401/program/RepeatMasker/RepeatMasker -lib repeats_to_mask_LTR85.fasta -nolow -dir . Mungbean.outinner85
perl ~/bin/CRL_Scripts1.0/cleanRM.pl Mungbean.outinner85.out Mungbean.outinner85.masked > Mungbean.outinner85.unmasked
perl ~/bin/CRL_Scripts1.0/rmshortinner.pl Mungbean.outinner85.unmasked 50 > Mungbean.outinner85.clean
makeblastdb -in ~/bin/Tpases020812DNA -dbtype prot
blastx -query Mungbean.outinner85.clean -db ~/bin/Tpases020812DNA -evalue 1e-10 -num_descriptions 10 -out Mungbean.outinner85.clean_blastx.out.txt
perl ~/bin/CRL_Scripts1.0/outinner_blastx_parse.pl --blastx Mungbean.outinner85.clean_blastx.out.txt --outinner Mungbean.outinner85
perl ~/bin/CRL_Scripts1.0/CRL_Step4.pl --step3 CRL_Step3_Passed_Elements.fasta --resultfile Mungbean.result85 --innerfile passed_outinner_sequence.fasta --sequencefile standard_output.gapfilled.final.fa 
makeblastdb -in lLTRs_Seq_For_BLAST.fasta -dbtype nucl
blastn -query lLTRs_Seq_For_BLAST.fasta -db lLTRs_Seq_For_BLAST.fasta -evalue 1e-10 -num_descriptions 1000 -out lLTRs_Seq_For_BLAST.fasta.out
makeblastdb -in Inner_Seq_For_BLAST.fasta -dbtype nucl
blastn -query Inner_Seq_For_BLAST.fasta -db Inner_Seq_For_BLAST.fasta -evalue 1e-10 -num_descriptions 1000 -out Inner_Seq_For_BLAST.fasta.out
perl ~/bin/CRL_Scripts1.0/CRL_Step5.pl --LTR_blast lLTRs_Seq_For_BLAST.fasta.out --inner_blast Inner_Seq_For_BLAST.fasta.out --step3 CRL_Step3_Passed_Elements.fasta --final LTR85.lib --pcoverage 90 --pidentity 80
/data/skyts0401/program/RepeatMasker/RepeatMasker -lib ../99/LTR99.lib -dir . LTR85.lib
perl ~/bin/CRL_Scripts1.0/remove_masked_sequence.pl --masked_elements LTR85.lib.masked --outfile FinalLTR85.lib
cd ..
cat 99/LTR99.lib 85/FinalLTR85.lib > allLTR.lib

Collecting repetitive sequences

ln -s /data/skyts0401/Mungbean/assembly/standard_output.gapfilled.final.fa .
nano fasta_devide.py
python fasta_devide.py standard_output.gapfilled.final.fa
nano repeatmask_combine.sh
chmod a+x repeatmask_combine.sh 
./repeatmask_combine.sh 
cat standard_output.gapfilled.final_devide*.fa.masked > standard_output.gapfilled.final.fa.masked
perl ~/bin/CRL_Scripts1.0/rmaskedpart.pl standard_output.gapfilled.final.fa.masked 50 > umseqfile
/data/skyts0401/program/RepeatModeler-open-1.0.9/BuildDatabase -name umseqfiledb -engine ncbi umseqfile 
nohup /data/skyts0401/program/RepeatModeler-open-1.0.9/RepeatModeler -database umseqfiledb >& umseqfile.out
perl ~/bin/CRL_Scripts1.0/repeatmodeler_parse.pl --fastafile consensi.fa.classified --unknowns repeatmodeler_unknowns.fasta --identities repeatmodeler_identities.fasta
makeblastdb -in ~/bin/Tpases020812 -dbtype prot
blastx -query repeatmodeler_unknowns.fasta -db ~/bin/Tpases020812 -evalue 1e-10 -num_descriptions 10 -out modelerunknown_blast_result.txt
~/bin/CRL_Scripts1.0/transposon_blast_parse.pl --blastx modelerunknown_blast_result.txt --modelerunknown repeatmodeler_unknowns.fasta
mv unknown_elements.txt ModelerUnknown.lib
cat identified_elements.txt repeatmodeler_identities.fasta > ModelerID.lib

Exclusion of gene fragments

makeblastdb -in ~/bin/alluniRefprexp070416 -dbtype prot
blastx -query ModelerUnknown.lib -db ~/bin/alluniRefprexp070416 -evalue 1e-10 -num_descriptions 10 -out ModelerUnknown.lib_blast_result.txt
cd LTR/
python headerforamt.py allLTR.lib > allLTR.lib.reformed (LTR library has '(' symbol, resulting in ProtExcluder error, so change the format)
cd ..
mkdir ProtExclude
cd ProtExclude/
cp ../MITE/MITE.lib .
cp ../LTR/allLTR.lib.reformed .
cp ../ModelerID.lib .
cp ../ModelerUnknown.lib .
cat allLTR.lib.reformed MITE.lib ModelerID.lib > KnownRepeats.lib
cat KnownRepeats.lib ModelerUnknown.lib > allRepeats.lib
blastx -query allRepeats.lib -db ~/bin/alluniRefprexp070416 -evalue 1e-10 -num_descriptions 10 -out allRepeats.lib_blast_results.txt
/data/skyts0401/program/ProtExcluder1.2/ProtExcluder.pl allRepeats.lib_blast_results.txt allRepeats.lib

5/26

Mungbean pacbio assembly

For assessment of assembly, run CEGMA and BUSCO


Install

CEGMA

(63:/data/skyts0401/program/)
sudo apt-get install wise (dependency)
wget ftp://genome.crg.es/pub/software/geneid/geneid_v1.4.4.Jan_13_2011.tar.gz (dependency)
tar -xvzf geneid_v1.4.4.Jan_13_2011.tar.gz
cd geneid
make
make install
nano ~/.profile (add $PATH:/data/skyts0401/program/geneid/bin)
cd ..
git clone https://github.com/KorfLab/CEGMA_v2.git
cd CEGMA_v2/
make


BUSCO

(63:/data/skyts0401/program/)
wget http://bioinf.uni-greifswald.de/augustus/binaries/augustus-3.2.3.tar.gz (dependency)
tar -xvzf augustus-3.2.3.tar.gz 
cd augustus-3.2.3/
make (dependency error)
sudo apt-get install bamtools libbamtools-dev
make
sudo make install
cd ..
git clone https://gitlab.com/ezlab/busco.git
cd busco
sudo python setup.py install
cp config/config.ini.default config/config.ini
nano config/config.ini (change the augustus path (path = /data/skyts0401/program/augustus-3.2.3/scripts/, bin/))


Running

CEGMA

(63:/data/skyts0401/Mungbean/cegma/)
export CEGMA="/data/skyts0401/program/CEGMA_v2"
export PERL5LIB="$PERL5LIB:$CEGMA/lib"
export PERL5LIB=$CEGMA/lib:$PERL5LIB
source ~/.profile 
/data/skyts0401/program/CEGMA_v2/bin/cegma --genome standard_output.gapfilled.final.fa -threads 5


BUSCO

(63:/data/skyts0401/Mungbean/busco/)
wget http://busco.ezlab.org/datasets/eukaryota_odb9.tar.gz (dataset)
wget http://busco.ezlab.org/datasets/embryophyta_odb9.tar.gz (dataset)
ln -s ../assembly/standard_output.gapfilled.final.fa .
export AUGUSTUS_CONFIG_PATH="/data/skyts0401/program/augustus-3.2.3/config/"
python /data/skyts0401/program/busco/scripts/run_BUSCO.py -i standard_output.gapfilled.final.fa -o Mungbean_busco -c 20 -l eukaryota_odb9/ -m geno
python /data/skyts0401/program/busco/scripts/run_BUSCO.py -i standard_output.gapfilled.final.fa -o Mungbean_plant_busco -c 20 -l embryophyta_odb9/ -m geno

5/29

Mungbean pacbio assembly

Maker


Install

ncbi-blast+

(63:/data/skyts0401/program/)
wget ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/2.6.0/ncbi-blast-2.6.0+-x64-linux.tar.gz
tar -xvzf ncbi-blast-2.6.0+-x64-linux.tar.gz


exonerate

(63:/data/skyts0401/program/)
git clone https://github.com/nathanweeks/exonerate.git
cd exonerate/
git checkout v2.4.0
autoreconf -i
./configure
make
sudo make install


Maker

(63:/data/skyts0401/program/)
download from http://www.yandell-lab.org/software/maker.html
cd maker/
cd src/
nano ~/.profile (add the RepeatMasker directory to $PATH)
source ~/.profile
perl Build.PL
./Build install


Running

Preparation

(63:/data/skyts0401/Mungbean/maker/)
(Add PATH(/data/skyts0401/program/maker/bin) to ~/.profile)
ln -s ../assembly/Vradi.pacbio.gapfilled.final.fa .
mkdir ../transcriptome
cd ../transcriptome/
scp skyts0401@147.46.250.244:/data/KangYJ/Mungbean/Transcriptome/merge/mungbean_merge.fa.cdhit.fa .
cd ../maker/
ln -s ../transcriptome/mungbean_merge.fa.cdhit.fa .
mkdir ref
cd ref/
(download Fvesca annotation file from phytozome)
unzip Fvesca_download.zip 
cd Fvesca/v1.1/annotation/
gunzip Fvesca_226_v1.1.protein.fa.gz 
gunzip Fvesca_226_v1.1.transcript.fa.gz 
cd ../../..
cp Fvesca/v1.1/annotation/Fvesca_226_v1.1.protein.fa .
cp Fvesca/v1.1/annotation/Fvesca_226_v1.1.transcript.fa .
scp skyts0401@147.46.250.244:/alima9002/ref/forJat/Athaliana_167_TAIR10*.fa
scp skyts0401@147.46.250.244:/alima9002/ref/forJat/Athaliana_167_TAIR10*.fa .
scp skyts0401@147.46.250.244:/alima9002/ref/forJat/Gmax*.fa .
scp skyts0401@147.46.250.244:/alima9002/ref/forJat/Ptrichocarpa*.fa .
scp skyts0401@147.46.250.244:/alima9002/ref/forJat/Vvinifera*.fa .
scp skyts0401@147.46.250.244:/alima9002/ref/forJat/Osativa*.fa .
cd ..
ln -s ../repeatmask/ProtExclude/allRepeats.libnoProtFinal
mkdir tmp


Running

(63:/data/skyts0401/Mungbean/maker/)
maker -CTL
nano maker_bopts.ctl (default, check blast_type=ncbi+)
nano maker_exe.ctl (set the paths to ncbi-blast+, RepeatMasker, exonerate, and augustus)
nano maker_opts.ctl (set the paths to the genome, the evidence (transcriptome, protein), the repeat library, and the temporary directory)
mpiexec -n 30 maker -fix_nucleotides maker_opts.ctl maker_bopts.ctl maker_exe.ctl >& maker_opts.ctl.log

6/26

Mungbean pacbio assembly

Checking synteny blocks for chromosome splits and merges


blast

(NICEM:~/data/Mungbean/blast)
makeblastdb -in Vradi.ver6.cor.pep.fa -dbtype 'prot'
blastall -i adzuki.ver3.pep.fa.tr.cor.fa -d Vradi.ver6.cor.pep.fa -p blastp -e 1e-10 -b 5 -v 5 -m 8 -o mcscanx/old_Va.blast
# same procedure for the proteins of the other organisms


MCScanX

(NICEM:~/data/Mungbean/blast/mcscanx)
python gffcombine.py Vradi_ver6.gff.sorted.by.TY.gff adzuki.ver3.gene.gff.cor.gff > old_Va.gff
~/data/program/MCScanX/MCScanX old_Va
# same procedure for the other organisms; just change the species name in gffcombine.py and in the command


Circos

(193:/data2/skyts0401/Mungbean/synteny/circos/)
/data2/skyts0401/program/circos-0.69-4/bin/circos -conf synteny_Va_gene.conf -outputfile synteny_Va_gene.png
# same procedure for the other organisms; change the configuration file