Hi! Sorry for the late reply, somehow the email notification went to my spam folder.
Here is the original NumPy code called introStatFuncs.py
import numpy as np
import pandas as pd
###############################################################################
###Common Functions###
###############################################################################
###calculate derived allele frequencies###
def alleleFreq(genotypeMatrix, POP):
###calculate allel frequencies per varaint
freq = genotypeMatrix[:,POP].sum(axis=1)/float(len(POP))
return freq
###calculate ABBA values from allele frequencies (Durand et al 2011)###
def ABBA(genotypeMatrix, Outgroup, POP1, POP2, POP3):
###calculate allel frequencies per population
po = alleleFreq(genotypeMatrix, Outgroup)
p1 = alleleFreq(genotypeMatrix, POP1)
p2 = alleleFreq(genotypeMatrix, POP2)
p3 = alleleFreq(genotypeMatrix, POP3)
###calculate ABBA values
ABBA = (1-p1)*(p2)*(p3)*(1-po)
return ABBA
###calculate BABA values from allele frequencies (Durand et al 2011)###
def BABA(genotypeMatrix, Outgroup, POP1, POP2, POP3):
###calculate allel frequencies per population
po = alleleFreq(genotypeMatrix, Outgroup)
p1 = alleleFreq(genotypeMatrix, POP1)
p2 = alleleFreq(genotypeMatrix, POP2)
p3 = alleleFreq(genotypeMatrix, POP3)
###calculate BABA values
BABA = (p1)*(1-p2)*(p3)*(1-po)
return BABA
###calculate genetic distances between two pops (Smith and Kronforst 2013)###
def dXYfreq(genotypeMatrix, POPX, POPY, seqLen):
###calculate allel frequencies per varaint
px = alleleFreq(genotypeMatrix, POPX)
py = alleleFreq(genotypeMatrix, POPY)
###calculate sequence divergence
pxy = (px)*(1-py)
pyx = (py)*(1-px)
psum = pxy+pyx
ptot = psum.sum()
###divide by the sequence length for average pairwise sequence divergence
dXY = ptot/float(seqLen)
return dXY
###############################################################################
###Introgression Statistics###
###############################################################################
###calculate Patterson's D from allele frequencies (Durand et al 2011)###
def D(genotypeMatrix, Outgroup, POP1, POP2, POP3):
###calculate the numerator and denominator
abba = ABBA(genotypeMatrix, Outgroup, POP1, POP2, POP3)
baba = BABA(genotypeMatrix, Outgroup, POP1, POP2, POP3)
num = abba-baba
den = abba+baba
Dnum = num.sum()
Dden = den.sum()
###return -100 if the denominator is 0 else return the D value
if (Dden != 0):
D = Dnum/float(Dden)
else:
D = np.nan
return D
###calculate fd from allele frequencies (Martin et al 2015)###
def fd(genotypeMatrix, Outgroup, POP1, POP2, POP3):
###calculate ADDA and DADA values
po = alleleFreq(genotypeMatrix, Outgroup)
p1 = alleleFreq(genotypeMatrix, POP1)
p2 = alleleFreq(genotypeMatrix, POP2)
p3 = alleleFreq(genotypeMatrix, POP3)
pd = np.maximum(p2, p3)
adda = (1-p1)*(pd)*(pd)*(1-po)
dada = (p1)*(1-pd)*(pd)*(1-po)
###calculate the numerator and denominator
abba = ABBA(genotypeMatrix, Outgroup, POP1, POP2, POP3)
baba = BABA(genotypeMatrix, Outgroup, POP1, POP2, POP3)
num = abba-baba
den = adda-dada
fdNUM = num.sum()
fdDEN = den.sum()
###return -100 if the denominator is 0 else return the fd value
if (fdDEN != 0):
fd = fdNUM/float(fdDEN)
else:
fd = np.nan
return fd
###calculate RNDmin (Rosenzweig et al 2016)###
def RNDmin(genotypeMatrix, POPX, POPY, Outgroup, seqLen):
###calculate dOUT
dXO = dXYfreq(genotypeMatrix, POPX, Outgroup, seqLen)
dYO = dXYfreq(genotypeMatrix, POPY, Outgroup, seqLen)
dOUT = (dXO+dYO)/float(2)
###calculate dMIN
PWdiff = []
for i in POPX:
for j in POPY:
PWdiff.append(np.count_nonzero((genotypeMatrix[:,i] == genotypeMatrix[:,j]) == False))
minPWdiff = min(PWdiff)
dMIN = minPWdiff/float(seqLen)
###calculate RNDmin
if (dOUT != 0):
RNDmin = dMIN/float(dOUT)
else:
RNDmin = np.nan
return RNDmin
###calculate df (Pfeifer and Kapan 2019)###
def df(genotypeMatrix, POP1, POP2, POP3):
###implement laplace smoothing for POP1
p1MAT = genotypeMatrix[:,POP1]
anc1 = np.zeros((p1MAT.shape[0],1))
der1 = np.ones((p1MAT.shape[0],1))
POP1L = np.hstack((p1MAT,anc1,der1))
p1L = POP1L.sum(axis=1)/float(POP1L.shape[1])
###implement laplace smoothing for POP2
p2MAT = genotypeMatrix[:,POP2]
anc2 = np.zeros((p2MAT.shape[0],1))
der2 = np.ones((p2MAT.shape[0],1))
POP2L = np.hstack((p2MAT,anc2,der2))
p2L = POP2L.sum(axis=1)/float(POP2L.shape[1])
###calculate genetic distances based on allele frequencies
p3 = alleleFreq(genotypeMatrix, POP3)
d13 = (p1L*(1-p3))+((1-p1L)*p3)
d23 = (p2L*(1-p3))+((1-p2L)*p3)
###calculate the numerator and denominator
num = (p2L*d13)-(p1L*d23)
den = (p2L*d13)+(p1L*d23)
dfNUM = num.sum()
dfDEN = den.sum()
###return -100 if the denominator is 0 else return the df value
if (dfDEN != 0):
df = dfNUM/dfDEN
else:
df = np.nan
return df
###calculate D3 (Hahn and Hibbins 2019)###
def D3(genotypeMatrix, POP1, POP2, POP3, seqLen):
###calcualte average number of pairwise differences
dAC = dXYfreq(genotypeMatrix, POP1, POP3, seqLen)
dBC = dXYfreq(genotypeMatrix, POP2, POP3, seqLen)
###calculate the numerator and denominator
D3num = dBC-dAC
D3den = dBC+dAC
###return -100 if the denominator is 0 else return the D3 value
if (D3den != 0):
D3 = D3num/D3den
else:
D3 = np.nan
return D3
###calculate D+###
def Dplus(genotypeMatrix, Outgroup, POP1, POP2, POP3):
###calculate ABBA and BABA values
abba = ABBA(genotypeMatrix, Outgroup, POP1, POP2, POP3)
baba = BABA(genotypeMatrix, Outgroup, POP1, POP2, POP3)
###calculate allel frequencies per population
po = alleleFreq(genotypeMatrix, Outgroup)
p1 = alleleFreq(genotypeMatrix, POP1)
p2 = alleleFreq(genotypeMatrix, POP2)
p3 = alleleFreq(genotypeMatrix, POP3)
###calculate BAAA and ABAA values
baaa = (p1)*(1-p2)*(1-p3)*(1-po)
abaa = (1-p1)*(p2)*(1-p3)*(1-po)
###calculate the numerator and denominator
num = (abba-baba)+(baaa-abaa)
den = abba+baba+baaa+abaa
DplusNUM = num.sum()
DplusDEN = den.sum()
###return -100 if the denominator is 0 else return the D+ value
if (DplusDEN != 0):
Dplus = DplusNUM/float(DplusDEN)
else:
Dplus = np.nan
return Dplus
And here is the Numba code (in lazy mode) called fast_stats.py
from numba import njit
import numpy as np
###############################################################################
###Common Functions###
###############################################################################
@njit(fastmath=True, cache=True)
def allele_freq(genotype_matrix, POP):
"""
Calculate derived allele frequencies
"""
# Calculate allel frequencies per varaint
freq = genotype_matrix[:,POP].sum(axis=1)/float(len(POP))
return freq
@njit(fastmath=True, cache=True)
def ABBA(genotype_matrix, Outgroup, POP1, POP2, POP3):
"""
Calculate ABBA values from allele frequencies (Durand et al 2011)
"""
# Calculate allel frequencies per population
po = allele_freq(genotype_matrix, Outgroup)
p1 = allele_freq(genotype_matrix, POP1)
p2 = allele_freq(genotype_matrix, POP2)
p3 = allele_freq(genotype_matrix, POP3)
# Calculate ABBA values
ABBA = (1-p1)*(p2)*(p3)*(1-po)
return ABBA
@njit(fastmath=True, cache=True)
def BABA(genotype_matrix, Outgroup, POP1, POP2, POP3):
"""
Calculate BABA values from allele frequencies (Durand et al 2011)
"""
# Calculate allel frequencies per population
po = allele_freq(genotype_matrix, Outgroup)
p1 = allele_freq(genotype_matrix, POP1)
p2 = allele_freq(genotype_matrix, POP2)
p3 = allele_freq(genotype_matrix, POP3)
# Calculate BABA values
BABA = (p1)*(1-p2)*(p3)*(1-po)
return BABA
@njit(fastmath=True, cache=True)
def dXY_freq(genotype_matrix, POPX, POPY, seq_len):
"""
Calculate genetic distances between two pops (Smith and Kronforst 2013)
"""
# Calculate allel frequencies per varaint
px = allele_freq(genotype_matrix, POPX)
py = allele_freq(genotype_matrix, POPY)
# Calculate sequence divergence
pxy = (px)*(1-py)
pyx = (py)*(1-px)
psum = pxy+pyx
ptot = psum.sum()
# Divide by the sequence length for average pairwise sequence divergence
dXY = ptot/float(seq_len)
return dXY
###############################################################################
###Introgression Statistics###
###############################################################################
@njit(fastmath=True, cache=True)
def D(genotype_matrix, Outgroup, POP1, POP2, POP3):
"""
Calculate Patterson's D from allele frequencies (Durand et al 2011)
"""
# Calculate the numerator and denominator
abba = ABBA(genotype_matrix, Outgroup, POP1, POP2, POP3)
baba = BABA(genotype_matrix, Outgroup, POP1, POP2, POP3)
num = abba-baba
den = abba+baba
Dnum = num.sum()
Dden = den.sum()
# Return nan if the denominator is 0 else return the D value
if (Dden != 0):
D = Dnum/float(Dden)
else:
D = np.nan
return D
@njit(fastmath=True, cache=True)
def fd(genotype_matrix, Outgroup, POP1, POP2, POP3):
"""
Calculate fd from allele frequencies (Martin et al 2015)
"""
# Calculate ADDA and DADA values
po = allele_freq(genotype_matrix, Outgroup)
p1 = allele_freq(genotype_matrix, POP1)
p2 = allele_freq(genotype_matrix, POP2)
p3 = allele_freq(genotype_matrix, POP3)
pd = np.maximum(p2, p3)
adda = (1-p1)*(pd)*(pd)*(1-po)
dada = (p1)*(1-pd)*(pd)*(1-po)
# Calculate the numerator and denominator
abba = ABBA(genotype_matrix, Outgroup, POP1, POP2, POP3)
baba = BABA(genotype_matrix, Outgroup, POP1, POP2, POP3)
num = abba-baba
den = adda-dada
fdNUM = num.sum()
fdDEN = den.sum()
# Return nan if the denominator is 0 else return the fd value
if (fdDEN != 0):
fd = fdNUM/float(fdDEN)
else:
fd = np.nan
return fd
@njit(fastmath=True, cache=True)
def RNDmin(genotype_matrix, POPX, POPY, Outgroup, seq_len):
"""
Calculate RNDmin (Rosenzweig et al 2016)
"""
# Calculate dOUT
dXO = dXY_freq(genotype_matrix, POPX, Outgroup, seq_len)
dYO = dXY_freq(genotype_matrix, POPY, Outgroup, seq_len)
dOUT = (dXO+dYO)/float(2)
# Calculate dMIN
PWdiff = []
for i in POPX:
for j in POPY:
PWdiff.append(len(np.nonzero((genotype_matrix[:,i] == genotype_matrix[:,j]) == np.bool(False))[0]))
minPWdiff = min(PWdiff)
dMIN = minPWdiff/float(seq_len)
# Return nan if the denominator is 0 else return the RNDmin value
if (dOUT != 0):
RNDmin = dMIN/float(dOUT)
else:
RNDmin = np.nan
return RNDmin
@njit(fastmath=True, cache=True)
def df(genotype_matrix, POP1, POP2, POP3):
"""
Calculate df (Pfeifer and Kapan 2019)
"""
# Implement laplace smoothing for POP1
p1MAT = genotype_matrix[:,POP1]
anc1 = np.zeros((p1MAT.shape[0],1))
der1 = np.ones((p1MAT.shape[0],1))
POP1L = np.hstack((p1MAT,anc1,der1))
p1L = POP1L.sum(axis=1)/float(POP1L.shape[1])
# Implement laplace smoothing for POP2
p2MAT = genotype_matrix[:,POP2]
anc2 = np.zeros((p2MAT.shape[0],1))
der2 = np.ones((p2MAT.shape[0],1))
POP2L = np.hstack((p2MAT,anc2,der2))
p2L = POP2L.sum(axis=1)/float(POP2L.shape[1])
# Calculate genetic distances based on allele frequencies
p3 = allele_freq(genotype_matrix, POP3)
d13 = (p1L*(1-p3))+((1-p1L)*p3)
d23 = (p2L*(1-p3))+((1-p2L)*p3)
# Calculate the numerator and denominator
num = (p2L*d13)-(p1L*d23)
den = (p2L*d13)+(p1L*d23)
dfNUM = num.sum()
dfDEN = den.sum()
# Return nan if the denominator is 0 else return the df value
if (dfDEN != 0):
df = dfNUM/dfDEN
else:
df = np.nan
return df
@njit(fastmath=True, cache=True)
def D3(genotype_matrix, POP1, POP2, POP3, seq_len):
"""
Calculate D3 (Hahn and Hibbins 2019)
"""
# Calcualte average number of pairwise differences
dAC = dXY_freq(genotype_matrix, POP1, POP3, seq_len)
dBC = dXY_freq(genotype_matrix, POP2, POP3, seq_len)
# Calculate the numerator and denominator
D3num = dBC-dAC
D3den = dBC+dAC
# Return nan if the denominator is 0 else return the D3 value
if (D3den != 0):
D3 = D3num/D3den
else:
D3 = np.nan
return D3
@njit(fastmath=True, cache=True)
def Dplus(genotype_matrix, Outgroup, POP1, POP2, POP3):
"""
Calculate D+
"""
# Calculate ABBA and BABA values
abba = ABBA(genotype_matrix, Outgroup, POP1, POP2, POP3)
baba = BABA(genotype_matrix, Outgroup, POP1, POP2, POP3)
# Calculate allel frequencies per population
po = allele_freq(genotype_matrix, Outgroup)
p1 = allele_freq(genotype_matrix, POP1)
p2 = allele_freq(genotype_matrix, POP2)
p3 = allele_freq(genotype_matrix, POP3)
# Calculate BAAA and ABAA values
baaa = (p1)*(1-p2)*(1-p3)*(1-po)
abaa = (1-p1)*(p2)*(1-p3)*(1-po)
# Calculate the numerator and denominator
num = (abba-baba)+(baaa-abaa)
den = abba+baba+baaa+abaa
DplusNUM = num.sum()
DplusDEN = den.sum()
# Return nan if the denominator is 0 else return the D+ value
if (DplusDEN != 0):
Dplus = DplusNUM/float(DplusDEN)
else:
Dplus = np.nan
return Dplus
Here is the python code from the package msprime that is needed to run my simulations, which is called msprime_models.py
import msprime
def one_pulse(
seq_len,
num_seq,
N,
tp123,
tp12,
tgf,
mu,
r,
gamma,
num_reps=1,
random_seed=None):
"""
Simulates four populations (P1, P2, P3, and an outgroup) where the only
demographic events are population divergences and one pulse of
introgression. Sequence length is in base pairs and both mutation and
recombination rate are in base pairs per generation. This function returns
a tree sequence file.
"""
# Define simulation parameters
samp_size = num_seq # Number of sequences
prob_introgress = gamma # Probability of a lineage introgressing
T = (4 * N) # Ancestral population P123
Tp3 = (tp123 * N) # P123 splits to P12 and P3
Tp2 = (tp12 * N) # P12 splits to P1 and P2
Tgf = (tgf * N) # Introgression event
# Define samples
samples = [msprime.Sample(population=0,time=0)]*samp_size # Outgroup
samples.extend([msprime.Sample(population=1,time=0)]*samp_size) # P1
samples.extend([msprime.Sample(population=2,time=0)]*samp_size) # P2
samples.extend([msprime.Sample(population=3,time=0)]*samp_size) # P3
# Configure populations with a constant population size
population_configurations = [
msprime.PopulationConfiguration(initial_size=10000),
msprime.PopulationConfiguration(initial_size=10000),
msprime.PopulationConfiguration(initial_size=10000),
msprime.PopulationConfiguration(initial_size=10000),
]
# Define demographic events
demographic_events = [
msprime.MassMigration(
time=Tgf, source=2, destination=3, proportion=prob_introgress), # Introgression event
msprime.MassMigration(
time=Tp2, source=2, destination=1, proportion=1.0), # P1 and P2 merge
msprime.MassMigration(
time=Tp3, source=3, destination=1, proportion=1.0), # P12 and P3 merge
msprime.MassMigration(
time=T, source=1, destination=0, proportion=1.0), # P123 and O merge
]
# Simulate in msprime
ts = msprime.simulate(
samples=samples,
length=seq_len,
recombination_rate=r,
mutation_rate=mu,
population_configurations=population_configurations,
demographic_events=demographic_events,
num_replicates=num_reps,
random_seed=random_seed,
)
#print('done simulating')
return ts
Because of constraints on computing power for my local machine, I ran this scaled-down version in Spyder and saw that the Numba code was faster!
import sys
sys.path.append('/Users/davidpeede/Dropbox/Bioinformatics/HSscripts/introgressionSimStudy')
import fast_stats as fs
import introStatFuncs as isf
import msprime_models as mm
import time
slow_start = time.time()
null = mm.one_pulse(
seq_len=1e6,
num_seq=100,
N=100000,
tp123=3,
tp12=2,
tgf=1,
mu=1e-8,
r=7e-9,
gamma=0,
num_reps=100,
random_seed=42)
for i,tree in enumerate(null):
genotable = tree.genotype_matrix()
O = tree.get_samples(0)
P1 = tree.get_samples(1)
P2 = tree.get_samples(2)
P3 = tree.get_samples(3)
isf.D(genotable, O, P1, P2, P3)
isf.fd(genotable, O, P1, P2, P3)
isf.df(genotable, P1, P2, P3)
isf.D3(genotable, P1, P2, P3, 1e6)
isf.Dplus(genotable, O, P1, P2, P3)
test = mm.one_pulse(
seq_len=1e6,
num_seq=100,
N=100000,
tp123=3,
tp12=2,
tgf=1,
mu=1e-8,
r=7e-9,
gamma=0.1,
num_reps=100,
random_seed=42)
for i,tree in enumerate(test):
genotable = tree.genotype_matrix()
O = tree.get_samples(0)
P1 = tree.get_samples(1)
P2 = tree.get_samples(2)
P3 = tree.get_samples(3)
isf.D(genotable, O, P1, P2, P3)
isf.fd(genotable, O, P1, P2, P3)
isf.df(genotable, P1, P2, P3)
isf.D3(genotable, P1, P2, P3, 1e6)
isf.Dplus(genotable, O, P1, P2, P3)
slow_end = time.time()
print('slow time '+str(slow_end-slow_start))
fast_start = time.time()
dry_run = mm.one_pulse(
seq_len=100,
num_seq=2,
N=10,
tp123=3,
tp12=2,
tgf=1,
mu=1e-8,
r=1e-8,
gamma=0,
num_reps=1,
random_seed=42)
for i,tree in enumerate(dry_run):
genotable = tree.genotype_matrix()
O = tree.get_samples(0)
P1 = tree.get_samples(1)
P2 = tree.get_samples(2)
P3 = tree.get_samples(3)
fs.D(genotable, O, P1, P2, P3)
fs.fd(genotable, O, P1, P2, P3)
fs.df(genotable, P1, P2, P3)
fs.D3(genotable, P1, P2, P3, 100)
fs.Dplus(genotable, O, P1, P2, P3)
null = mm.one_pulse(
seq_len=1e6,
num_seq=100,
N=100000,
tp123=3,
tp12=2,
tgf=1,
mu=1e-8,
r=7e-9,
gamma=0,
num_reps=100,
random_seed=42)
for i,tree in enumerate(null):
genotable = tree.genotype_matrix()
O = tree.get_samples(0)
P1 = tree.get_samples(1)
P2 = tree.get_samples(2)
P3 = tree.get_samples(3)
fs.D(genotable, O, P1, P2, P3)
fs.fd(genotable, O, P1, P2, P3)
fs.df(genotable, P1, P2, P3)
fs.D3(genotable, P1, P2, P3, 1e6)
fs.Dplus(genotable, O, P1, P2, P3)
test = mm.one_pulse(
seq_len=1e6,
num_seq=100,
N=100000,
tp123=3,
tp12=2,
tgf=1,
mu=1e-8,
r=7e-9,
gamma=0.1,
num_reps=100,
random_seed=42)
for i,tree in enumerate(test):
genotable = tree.genotype_matrix()
O = tree.get_samples(0)
P1 = tree.get_samples(1)
P2 = tree.get_samples(2)
P3 = tree.get_samples(3)
fs.D(genotable, O, P1, P2, P3)
fs.fd(genotable, O, P1, P2, P3)
fs.df(genotable, P1, P2, P3)
fs.D3(genotable, P1, P2, P3, 1e6)
fs.Dplus(genotable, O, P1, P2, P3)
fast_end = time.time()
print('fast time '+str(fast_end-fast_start))
This code resulted in this output:
slow time 78.05009198188782
fast time 66.48152017593384
I then wrote a more realistic set of simulations to test on the HPC, which I called HPC_speed_check.py
import msprime
import sys
import numpy as np
from scipy import stats
import fast_stats as fs
import introStatFuncs as isf
import msprime_models as mm
import time
# sys.argv[1] = sequence length
# sys.argv[2] = number of haploid individuals
# sys.argv[3] = number of reps
# Run with the "slow" stats
slow_start = time.time()
null = mm.one_pulse(
seq_len=int(sys.argv[1]),
num_seq=int(sys.argv[2]),
N=1000000,
tp123=3,
tp12=2,
tgf=1,
mu=0.00000001,
r=0.000000007,
gamma=0,
num_reps=int(sys.argv[3]),
random_seed=42)
nullD = []
nullfd = []
nulldf = []
nullD3 = []
nullDplus = []
for i,tree in enumerate(null):
genotable = tree.genotype_matrix()
O = tree.get_samples(0)
P1 = tree.get_samples(1)
P2 = tree.get_samples(2)
P3 = tree.get_samples(3)
nullD.append(isf.D(genotable, O, P1, P2, P3))
nullfd.append(isf.fd(genotable, O, P1, P2, P3))
nulldf.append(isf.df(genotable, P1, P2, P3))
nullD3.append(isf.D3(genotable, P1, P2, P3, int(sys.argv[1])))
nullDplus.append(isf.Dplus(genotable, O, P1, P2, P3))
DNull = np.asarray(nullD)
fdNull = np.asarray(nullfd)
dfNull = np.asarray(nulldf)
D3Null = np.asarray(nullD3)
DplusNull = np.asarray(nullDplus)
DSE = (2*(stats.sem(DNull, nan_policy='omit')))
fdSE = (2*(stats.sem(fdNull, nan_policy='omit')))
dfSE = (2*(stats.sem(dfNull, nan_policy='omit')))
D3SE = (2*(stats.sem(D3Null, nan_policy='omit')))
DplusSE = (2*(stats.sem(DplusNull, nan_policy='omit')))
test = mm.one_pulse(
seq_len=int(sys.argv[1]),
num_seq=int(sys.argv[2]),
N=1000000,
tp123=3,
tp12=2,
tgf=1,
mu=0.00000001,
r=0.000000007,
gamma=0.1,
num_reps=int(sys.argv[3]),
random_seed=42)
DPOW = []
fdPOW =[]
dfPOW = []
D3POW = []
DplusPOW = []
D = np.asarray([])
fd = np.asarray([])
df = np.asarray([])
D3 = np.asarray([])
Dplus = np.asarray([])
for i,tree in enumerate(test):
genotable = tree.genotype_matrix()
O = tree.get_samples(0)
P1 = tree.get_samples(1)
P2 = tree.get_samples(2)
P3 = tree.get_samples(3)
D = np.append(D, isf.D(genotable, O, P1, P2, P3))
fd = np.append(fd, isf.fd(genotable, O, P1, P2, P3))
df = np.append(df, isf.df(genotable, P1, P2, P3))
D3 = np.append(D3, isf.D3(genotable, P1, P2, P3, int(sys.argv[1])))
Dplus = np.append(Dplus, isf.Dplus(genotable, O, P1, P2, P3))
DPOW.append(np.nanmean(D > DSE))
fdPOW.append(np.nanmean(fd > fdSE))
dfPOW.append(np.nanmean(df > dfSE))
D3POW.append(np.nanmean(D3 > D3SE))
DplusPOW.append(np.nanmean(Dplus > DplusSE))
slow_stop = time.time()
# Run with the "fast" stats
fast_start = time.time()
dry_run = mm.one_pulse(
seq_len=100,
num_seq=2,
N=10,
tp123=3,
tp12=2,
tgf=1,
mu=1e-8,
r=1e-8,
gamma=0,
num_reps=1,
random_seed=42)
for i,tree in enumerate(dry_run):
genotable = tree.genotype_matrix()
O = tree.get_samples(0)
P1 = tree.get_samples(1)
P2 = tree.get_samples(2)
P3 = tree.get_samples(3)
fs.D(genotable, O, P1, P2, P3)
fs.fd(genotable, O, P1, P2, P3)
fs.df(genotable, P1, P2, P3)
fs.D3(genotable, P1, P2, P3, int(sys.argv[1]))
fs.Dplus(genotable, O, P1, P2, P3)
null = mm.one_pulse(
seq_len=int(sys.argv[1]),
num_seq=int(sys.argv[2]),
N=1000000,
tp123=3,
tp12=2,
tgf=1,
mu=0.00000001,
r=0.000000007,
gamma=0,
num_reps=int(sys.argv[3]),
random_seed=42)
nullD = []
nullfd = []
nulldf = []
nullD3 = []
nullDplus = []
for i,tree in enumerate(null):
genotable = tree.genotype_matrix()
O = tree.get_samples(0)
P1 = tree.get_samples(1)
P2 = tree.get_samples(2)
P3 = tree.get_samples(3)
nullD.append(fs.D(genotable, O, P1, P2, P3))
nullfd.append(fs.fd(genotable, O, P1, P2, P3))
nulldf.append(fs.df(genotable, P1, P2, P3))
nullD3.append(fs.D3(genotable, P1, P2, P3, int(sys.argv[1])))
nullDplus.append(fs.Dplus(genotable, O, P1, P2, P3))
DNull = np.asarray(nullD)
fdNull = np.asarray(nullfd)
dfNull = np.asarray(nulldf)
D3Null = np.asarray(nullD3)
DplusNull = np.asarray(nullDplus)
DSE = (2*(stats.sem(DNull, nan_policy='omit')))
fdSE = (2*(stats.sem(fdNull, nan_policy='omit')))
dfSE = (2*(stats.sem(dfNull, nan_policy='omit')))
D3SE = (2*(stats.sem(D3Null, nan_policy='omit')))
DplusSE = (2*(stats.sem(DplusNull, nan_policy='omit')))
test = mm.one_pulse(
seq_len=int(sys.argv[1]),
num_seq=int(sys.argv[2]),
N=1000000,
tp123=3,
tp12=2,
tgf=1,
mu=0.00000001,
r=0.000000007,
gamma=0.1,
num_reps=int(sys.argv[3]),
random_seed=42)
DPOW = []
fdPOW =[]
dfPOW = []
D3POW = []
DplusPOW = []
D = np.asarray([])
fd = np.asarray([])
df = np.asarray([])
D3 = np.asarray([])
Dplus = np.asarray([])
for i,tree in enumerate(test):
genotable = tree.genotype_matrix()
O = tree.get_samples(0)
P1 = tree.get_samples(1)
P2 = tree.get_samples(2)
P3 = tree.get_samples(3)
D = np.append(D, fs.D(genotable, O, P1, P2, P3))
fd = np.append(fd, fs.fd(genotable, O, P1, P2, P3))
df = np.append(df, fs.df(genotable, P1, P2, P3))
D3 = np.append(D3, fs.D3(genotable, P1, P2, P3, int(sys.argv[1])))
Dplus = np.append(Dplus, fs.Dplus(genotable, O, P1, P2, P3))
DPOW.append(np.nanmean(D > DSE))
fdPOW.append(np.nanmean(fd > fdSE))
dfPOW.append(np.nanmean(df > dfSE))
D3POW.append(np.nanmean(D3 > D3SE))
DplusPOW.append(np.nanmean(Dplus > DplusSE))
fast_stop = time.time()
# Output times
print('slow time '+str(slow_stop-slow_start))
print('------------------------------------')
print('fast time '+str(fast_stop-fast_start))
I then executed this python script on the HPC using the following SLURM shell wrappers using 1 node and 1 core vs 1 node and 2 cores:
#!/bin/bash
for seqLen in 1000000; do for indv in 100; do for reps in 1000; do
sbatch -J speed_check -N 1 -n 1 -t 3-23 --mem=10G -o ${seqLen}len_${indv}indvs_${reps}reps_speed_check-%A.out -e ${seqLen}len_${indv}indvs_${reps}reps_speed_check-%A.err --mail-type=ALL --mail-user=david_peede@brown.edu --wrap="module load anaconda/2020.02; source /gpfs/runtime/opt/anaconda/2020.02/etc/profile.d/conda.sh; conda activate msprimenv; python HPC_speed_check.py ${seqLen} ${indv} ${reps} > one_core_seqLen${seqLen}_indvs${indv}_reps${reps}_speed_check.txt"
done; done; done
#!/bin/bash
for seqLen in 1000000; do for indv in 100; do for reps in 1000; do
sbatch -J speed_check -N 1 -n 2 -t 3-23 --mem=10G -o ${seqLen}len_${indv}indvs_${reps}reps_speed_check-%A.out -e ${seqLen}len_${indv}indvs_${reps}reps_speed_check-%A.err --mail-type=ALL --mail-user=david_peede@brown.edu --wrap="module load anaconda/2020.02; source /gpfs/runtime/opt/anaconda/2020.02/etc/profile.d/conda.sh; conda activate msprimenv; python HPC_speed_check.py ${seqLen} ${indv} ${reps} > two_core_seqLen${seqLen}_indvs${indv}_reps${reps}_speed_check.txt"
done; done; done
The output for the 1 node 1 core was:
slow time 6217.974141836166
fast time 6524.659375905991
And the output for the 1 node 2 cores was:
slow time 6295.47022819519
fast time 6583.939221382141
What was odd to me was that the same Numba code was faster on my local machine and that both the NumPy and Numba code was slower when I added an extra core.
Again I am very new to coding so I could be possibly making really simple mistakes that I don’t realize, but I would appreciate any advice!
Thanks again!
-Dave