## ---- include = FALSE---------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----setup--------------------------------------------------------------------
library(SNPfiltR)
library(vcfR)

## -----------------------------------------------------------------------------
#load the example vcfR object 
data(vcfR.example)

### check the metadata present in your vcf
vcfR.example

vcfR.example@fix[1:10,1:8]

vcfR.example@gt[1:10,1:2]

#Load the example popmap file. It is a standard two column popmap, where the first column must be named 'id' and contain individual sample identifiers matching the sample identifiers in the vcf file, and the second column must be named 'pop', and contain a population assignment for each sample.
data(popmap)
popmap

## ---- fig.height= 4, fig.width=6----------------------------------------------
#generate exploratory visualizations of depth and genotype quality for all called genotypes
#hard_filter(vcfR=vcfR.example)

#hard filter to minimum depth of 5, and minimum genotype quality of 30
vcfR<-hard_filter(vcfR=vcfR.example, depth = 5, gq = 30)

## ---- fig.height= 3, fig.width=4----------------------------------------------
#execute allele balance filter
vcfR<-filter_allele_balance(vcfR)

## ---- fig.height= 3, fig.width=4----------------------------------------------
#visualize and pick appropriate max depth cutoff
#max_depth(vcfR)
#not running here to save space on visualizations

#filter vcf by the max depth cutoff you chose
vcfR<-max_depth(vcfR, maxdepth = 100)

#check vcfR to see how many SNPs we have left
vcfR

## ---- fig.height= 3, fig.width=4----------------------------------------------
#run function to visualize samples and return informative data.frame object
miss<-missing_by_sample(vcfR=vcfR)

#run function to drop samples above the threshold we want from the vcf
#here I am setting a relatively lax cutoff
vcfR<-missing_by_sample(vcfR=vcfR, cutoff = .9)

#remove invariant sites generated by sample trimming and genotype filtering
vcfR<-min_mac(vcfR, min.mac = 1)

#update popmap by removing samples that have been filtered out
popmap<-popmap[popmap$id %in% colnames(vcfR@gt)[-1],]

## ---- fig.height= 3, fig.width=4----------------------------------------------
#visualize missing data by SNP and the effect of various cutoffs on the missingness of each sample
missing_by_snp(vcfR)

## -----------------------------------------------------------------------------
#assess missing data effects on clustering
assess_missing_data_pca(vcfR = vcfR, popmap = popmap, thresholds = c(.8), clustering = FALSE)

## ---- fig.height= 3, fig.width=4----------------------------------------------
#show me the samples with the most missing data at an 80% completeness threshold
filt<-miss[miss$filt == .8,]
filt[order(filt$snps.retained),]

#drop the three samples with an excess of missing data at an 80% SNP completeness threshold
vcfR<- vcfR[,colnames(vcfR@gt) != "A_coerulescens_396263" & colnames(vcfR@gt) != "A_woodhouseii_334134" & colnames(vcfR@gt) != "A_coerulescens_396256"]

#remove invariant SNPs
vcfR<-min_mac(vcfR, min.mac = 1)
vcfR

#update popmap by removing samples that have been filtered out
popmap<-popmap[popmap$id %in% colnames(vcfR@gt)[-1],]

## ---- fig.height= 3, fig.width=4----------------------------------------------
missing_by_snp(vcfR)
#all samples look good at most thresholds, because of the small size of this dataset, I will choose a 60% completeness threshold in order to retain as many SNPs as possible

#filter vcfR
vcfR<-missing_by_snp(vcfR, cutoff = .6)

#look at final stats for our filtered vcf file
vcfR

## -----------------------------------------------------------------------------
#write out vcf
#vcfR::write.vcf(vcfR, file = "~/Downloads/scrub.jay.example.filtered.vcf.gz")

