#!/bin/bash
#######################
# Pipeline for sexing #
#######################
# Example pipeline for specimens 1196 and 1197 - all samples were run through
# this pipeline. Steps: (1) paleomix mapping, (2) one-off bed-file preparation
# from Satsuma synteny output, (3) human-contamination depletion + remapping to
# the narwhal genome + duplicate removal, (4) X/autosome coverage bootstrap.

### 1. Map the data using paleomix ###
module load java/v1.8.0_202-jdk python/v3.6.9 R/v3.6.1 htslib samtools bwa \
  bowtie2 AdapterRemoval mapDamage paleomix

chmod +x 2022_05_24_doubletusk_F_R1.yaml
xsbatch -c 8 -J twotusks --mem-per-cpu 15000 --time 7-00:00:00 -- \
  "paleomix bam run tusk_F_R1.yaml --max-threads 8 --bwa-max-threads 8 --adapterremoval-max-threads 8"
# See tusk_F_R1.yaml for the paleomix settings.

### 2. Preparing bed files ###
# This needs to be run only once - and not for each sample.
# Run Satsuma and map the narwhal reference genome (longest scaffolds) to the
# cow X chromosome and the human Y chromosome:
#   sh SatsumaSynteny.sh $1 $2 $3 $4 $5
# SatsumaSynteny.sh can be found at https://github.com/andreidae/SeXY
# Replace each number with the following variables:
#   $1 - Threads
#   $2 - Query sequence (reference genome assembly, RefGen)
#   $3 - Target sequence (reference sex-chromosome assembly, RefX and RefY)
#   $4 - Output_directory
#   $5 - Satsuma directory

# Extract regions mapping to X and Y independently (Satsuma columns 4-6 are
# the target scaffold, start, end):
grep chromosome_Y satsuma_summary.chained.out | awk '{print $4"\t"$5"\t"$6}' > Y.bed
grep chromosome_X satsuma_summary.chained.out | awk '{print $4"\t"$5"\t"$6}' > X.bed

# Remove regions that overlap between the two from the X bed:
grep -v -f <(bedtools intersect -a X.bed -b Y.bed) X.bed | bedtools sort > X_trim.bed

# Parse Satsuma output: reorder columns and strip the "_M..." suffix from
# scaffold names, then restore bed column order.
awk '{print $2,$3,$1}' X_trim.bed | perl -pe 's/\_M.*//' | awk '{print $3"\t"$1"\t"$2}' > X_trim.renamed.bed

# Make the autosomes bed file (reference.fasta.fai is made using
# "samtools faidx reference.fasta"). Any scaffold that appears in neither the
# renamed X nor Y bed is treated as autosomal, spanning its full length.
awk '{print $2,$3,$1}' X.bed | perl -pe 's/\_M.*//' | awk '{print $3"\t"$1"\t"$2}' > X.renamed.bed
awk '{print $2,$3,$1}' Y.bed | perl -pe 's/\_M.*//' | awk '{print $3"\t"$1"\t"$2}' > Y.renamed.bed
grep -v -w -f <(cat Y.renamed.bed X.renamed.bed | cut -f 1 | sort | uniq) \
  /groups/hologenomics/ariglesi/data/Narwhal/References/GCF_005190385.1_NGI_Narwhal_1_genomic.fasta.fai \
  | awk '{print $1"\t1\t"$2}' > Autosomes_v2.bed

### 3. Detecting human contamination in narwhal samples ###
module load htslib/v1.9
module load samtools
module load bwa

# Convert the narwhal bam files to fastq files:
for file in *bam; do
  samtools bam2fq "$file" > "$file.mapped.fastq"
done

# Map the narwhal fastq to the human genome and keep only the reads that do
# NOT map to it (samtools view -f 4 = unmapped), i.e. deplete human reads.
# Each echoed command line becomes one xsbatch array job.
for fastq in *.mapped.fastq; do
  bn=$(basename "$fastq" .mapped.fastq)
  echo "(bwa aln -l 999 /groups/hologenomics/data/genomes/human/hg38.UCSC.fasta $fastq | bwa samse /groups/hologenomics/data/genomes/human/hg38.UCSC.fasta - $fastq | samtools view -b -f 4 - > ${bn}.human-depleted.unmapped.bam)"
done | xsbatch -c 1 --mem-per-cpu=10G -J twotusks -R --max-array-jobs=2 --

# Make fastq files from the human-depleted reads:
for file in /shared/volume/hologenomics/data/mlouis/Narwhal/nuclear2022/sexing_2tusks_2/*.human-depleted.unmapped.bam; do
  samtools bam2fq "$file" > "$file.unmapped.bam.depleted.fastq"
done

# Map the human-depleted fastq files back to the narwhal genome, keeping only
# mapped reads (samtools view -F 4).
for fastq in *.unmapped.bam.depleted.fastq; do
  # Strip the full suffix the previous steps appended so bn is the original
  # sample name (X.bam -> X.bam.mapped.fastq -> X.bam.human-depleted.unmapped.bam
  # -> X.bam.human-depleted.unmapped.bam.unmapped.bam.depleted.fastq).
  bn=$(basename "$fastq" .bam.human-depleted.unmapped.bam.unmapped.bam.depleted.fastq)
  echo "(bwa aln -l 999 /groups/hologenomics/ariglesi/data/Narwhal/References/GCF_005190385.1_NGI_Narwhal_1_genomic.fasta $fastq | bwa samse /groups/hologenomics/ariglesi/data/Narwhal/References/GCF_005190385.1_NGI_Narwhal_1_genomic.fasta - $fastq | samtools view -b -F 4 - > ${bn}.humaDepleted.mappedNarwhal.bam)"
done | xsbatch -c 1 --mem-per-cpu=10G -J twotusks -R --max-array-jobs=10 --

# Sort by name, fix mate information, coordinate-sort, then remove duplicates:
for bam in *.humaDepleted.mappedNarwhal.bam; do
  bn=$(basename "$bam" .humaDepleted.mappedNarwhal.bam)
  echo "samtools sort -n $bam | samtools fixmate -r -p -m - - | samtools sort - | samtools markdup -r - $bn.rmdup.sorted.bam"
done | xsbatch -c 1 --mem-per-cpu=10G -J rmdup --time 02:00:00 -R --max-array-jobs=7 --
# NOTE(review): this marker is touched at submission time, not when the
# asynchronous xsbatch jobs actually finish - confirm intended.
touch .dup.done

### 4. Coverage script ###
# Estimate the coverage of the X and the autosomes in the ancient samples
# after depletion, and bootstrap the X/autosome coverage ratio 10 times.
# Arguments:
#   $1 - Autosomes.bed
#   $2 - Satsumaname (label used in output filenames)
#   $3 - X_trim.bed
#   $4 - Output_folder
for file in *rmdup.sorted.bam; do
  # Per-site depth, restricted to the bed regions, min base/map quality 25.
  samtools depth -a -q 25 -Q 25 -b "$1" "$file" > "$4/${file}_${2}_Autosomes.txt"
  samtools depth -a -q 25 -Q 25 -b "$3" "$file" > "$4/${file}_${2}_X_trim.txt"
  # 10 bootstrap replicates of mean depth over 10M randomly drawn sites
  # (assumes both depth files hold at least 10M lines - TODO confirm).
  for num in {1..10}; do
    shuf -n10000000 "$4/${file}_${2}_Autosomes.txt" | awk '{sum=sum+$3}END{print sum/10000000}' >> "$4/${file}_${2}_Autosomes_bootstrap.txt"
    shuf -n10000000 "$4/${file}_${2}_X_trim.txt" | awk '{sum=sum+$3}END{print sum/10000000}' >> "$4/${file}_${2}_X_trim_bootstrap.txt"
  done
  # Per-replicate X/autosome coverage ratio:
  paste "$4/${file}_${2}_Autosomes_bootstrap.txt" "$4/${file}_${2}_X_trim_bootstrap.txt" | awk '{print $2/$1}' > "$4/${file}_${2}_ratios.txt"
  rm "$4/${file}_${2}_Autosomes.txt" "$4/${file}_${2}_X_trim.txt"
done