Reproducibility 2: figure 3 & SI3

  • Extrapolation with un-seen mutations and their measured phenotypes

  • Test on biological model and exponential model (see src_simulated_data/readme.md for more details)

Load packages

[ ]:
import sys
sys.path.append('../')
from dlim.model import DLIM
from dlim.dataset import Data_model
from dlim.api import DLIM_API
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import pearsonr, spearmanr
from numpy.random import choice
from src_simulate_data.sim_data import Simulated
import numpy as np
from sklearn.metrics import r2_score
import matplotlib.patches as mpatches

Test on mechanistic model

[ ]:
# Simulate data
type_f = "bio"
nb_var = 30
data_simulated = Simulated(nb_var, type_f)
data = Data_model(data=pd.DataFrame(data_simulated.data), n_variables=2)

# choose the region of training
thres = 1.2
A_id = [i for i, el  in enumerate(data_simulated.A) if el >= thres]
B_id = [i for i, el  in enumerate(data_simulated.B) if el >= thres]
nA_id = [i for i, el  in enumerate(data_simulated.A) if i not in A_id]
nB_id = [i for i, el  in enumerate(data_simulated.B) if i not in B_id]

# Choose training and validation data
train_id = [i for i, el  in enumerate(data_simulated.data) if el[0] in A_id and el[1] in B_id]
val_id = [i for i in range(data_simulated.data.shape[0]) if i not in train_id]

train_data = data.subset(train_id)
val_data = data.subset(val_id)

# Develop DLIM model
model = DLIM(n_variables = train_data.nb_val, hid_dim = 32, nb_layer = 0)
dlim_regressor = DLIM_API(model=model, flag_spectral=True)

# Train DLIM model
losses = dlim_regressor.fit(train_data, lr = 1e-3, nb_epoch=300, batch_size=32, emb_regularization=0, \
                            save_path= None)
# Predict fitness, variance and phenotype
fit_v, vari_v, lat_v = dlim_regressor.predict(val_data.data[:,:-1], detach=True)
fit_t, var_t, lat_t = dlim_regressor.predict(train_data.data[:, :-1], detach=True)

spectral gap = 0.6666667461395264
spectral gap = 0.6999997496604919
[ ]:
fig, ax = plt.subplots(1, figsize=(2.5, 2.5))
# original landscape
data_simulated.plot(ax)

# Data in training set

ax.scatter(data_simulated.A[data.data[train_id, 0].int()], data_simulated.B[data.data[train_id, 1].int()], s=2, marker="o", c="black")
ax.scatter(data_simulated.A[data.data[val_id, 0].int()], data_simulated.B[data.data[val_id, 1].int()], s=2, marker="o", c="white")
for el in ["top", "right"]:
    ax.spines[el].set_visible(False)
plt.tight_layout()
plt.savefig(f"./img/fig3_{type_f}_land_data.png", dpi=300, transparent=True)
plt.show()

# Infered landscape and infered coordinates for training data
fig, bx = plt.subplots(1, figsize=(2.5, 2.5))
dlim_regressor.plot(bx, data)
bx.scatter(dlim_regressor.model.genes_emb[0][data.data[train_id, 0].long()].detach(),
          dlim_regressor.model.genes_emb[1][data.data[train_id, 1].long()].detach(),
           c=data.data[train_id, -1], s=2, cmap="bwr", marker="x")
for el in ["top", "right"]:
    bx.spines[el].set_visible(False)
plt.tight_layout()
plt.savefig(f"./img/fig3_{type_f}_land_pred.png", dpi=300, transparent=True)
plt.show()

# Infered phenotypes for testing data based on polynomial fitting
dlim_regressor.model.train_convert(A_id, data_simulated.A[A_id], 0)
dlim_regressor.model.train_convert(B_id, data_simulated.B[B_id], 1)
# Update the embedding, so the unseen data also gets an infered phenotype
dlim_regressor.model.update_emb(nA_id, data_simulated.A[nA_id], 0)
dlim_regressor.model.update_emb(nB_id, data_simulated.B[nB_id], 1)

# Visualize the fitting on phenotype
fig, (ax, bx) = plt.subplots(1, 2, figsize=(5, 2.5))
ax.scatter(dlim_regressor.model.genes_emb[0][A_id].detach(), data_simulated.A[A_id], c="black", s=20)
ax.scatter(dlim_regressor.model.genes_emb[0][nA_id].detach(), data_simulated.A[nA_id], c="orange", s=20)
ax.plot(np.polyval(dlim_regressor.model.conversion[0], np.linspace(0, 5, 100)), np.linspace(0, 5, 100), linewidth=1, linestyle="--", c="grey")
bx.scatter(dlim_regressor.model.genes_emb[1][B_id].detach(), data_simulated.B[B_id], c="black", s=20)
bx.scatter(dlim_regressor.model.genes_emb[1][nB_id].detach(), data_simulated.B[nB_id], c="orange", s=20)
bx.plot(np.polyval(dlim_regressor.model.conversion[1], np.linspace(0, 5, 100)), np.linspace(0, 5, 100), linewidth=1, linestyle="--", c="grey")
ax.set_xlabel("$\\varphi_1$")
ax.set_ylabel("$\\phi_1$")
bx.set_xlabel("$\\varphi_2$")
bx.set_ylabel("$\\phi_2$")
for el in ["top", "right"]:
    ax.spines[el].set_visible(False)
    bx.spines[el].set_visible(False)
plt.tight_layout()
plt.savefig(f"./img/fig3_{type_f}_cor_bio.png", dpi=300, transparent=True)
plt.show()

# Comparison of the prediction quality (polynomial fitting vs random)

fit_n, var_n, lat_n = dlim_regressor.predict(val_data.data[:, :-1], detach=True)
val_data_array = val_data.data[:, [-1]].numpy()
fig, ax = plt.subplots(1, figsize=(2.5, 2.5))
score = ((fit_n.flatten() -val_data_array.flatten())**2).mean()
score_v = ((fit_v.flatten() - val_data_array.flatten())**2).mean()

r2 = r2_score(fit_n.flatten(), val_data_array)
r2_v = r2_score(fit_v.flatten(), val_data_array)

ax.scatter(fit_n, val_data_array, s=2, c="orange", label=f"MSE$={score:.2f}$, $\\rho$ = {r2: 0.2f}")
ax.scatter(fit_v, val_data_array, s=2, c="grey", label=f"MSE$={score_v:.2f}$, $\\rho$ = {r2_v: 0.2f}")

ax.set_xlabel("$\\hat{F}$")
ax.set_ylabel("$F^{obs}$")
for el in ["top", "right"]:
    ax.spines[el].set_visible(False)
ax.legend(frameon=False, ncol=1, fontsize=9)
plt.tight_layout()
plt.savefig(f"./img/fig3_{type_f}_new_fit.svg", dpi=300, transparent=True)
plt.show()
../_images/reproducibility_figure3_5_0.png
../_images/reproducibility_figure3_5_1.png
../_images/reproducibility_figure3_5_2.png
../_images/reproducibility_figure3_5_3.png

Test on exponential model

[ ]:
# Load data
type_f = "exp"
nb_var = 30
data_simulated = Simulated(nb_var, type_f)
data = Data_model(data=pd.DataFrame(data_simulated.data), n_variables=2)

# Choose the region of training
thres = 1.2
A_id = [i for i, el  in enumerate(data_simulated.A) if el >= 2.7 or el < 1.5]
B_id = [i for i, el  in enumerate(data_simulated.B) if el >= 2.7 or el < 1.5]
nA_id = [i for i, el  in enumerate(data_simulated.A) if i not in A_id]
nB_id = [i for i, el  in enumerate(data_simulated.B) if i not in B_id]

# Split data into training and validation

train_id = [i for i, el  in enumerate(data_simulated.data) if el[0] in A_id and el[1] in B_id]
val_id = [i for i in range(data_simulated.data.shape[0]) if i not in train_id]

train_data = data.subset(train_id)
val_data = data.subset(val_id)

# Develop DLIM model
model = DLIM(n_variables = train_data.nb_val, hid_dim = 32, nb_layer = 0)
dlim_regressor = DLIM_API(model=model, flag_spectral=True)

# Train DLIM model
losses = dlim_regressor.fit(train_data, lr = 1e-3, nb_epoch=300, batch_size=32, emb_regularization=0, \
                            save_path= None)

# Predict fitness, variance and phenotype
fit_v, vari_v, lat_v = dlim_regressor.predict(val_data.data[:,:-1], detach=True)
fit_t, var_t, lat_t = dlim_regressor.predict(train_data.data[:, :-1], detach=True)

spectral gap = 0.76666659116745
spectral gap = 0.8000002503395081
[ ]:
fig, ax = plt.subplots(1, figsize=(2.5, 2.5))
# original landscape
data_simulated.plot(ax)

# Data in training set
ax.scatter(data_simulated.A[data.data[train_id, 0].int()], data_simulated.B[data.data[train_id, 1].int()], s=2, marker="o", c="black")
ax.scatter(data_simulated.A[data.data[val_id, 0].int()], data_simulated.B[data.data[val_id, 1].int()], s=2, marker="o", c="white")

for el in ["top", "right"]:
    ax.spines[el].set_visible(False)
plt.tight_layout()
plt.savefig(f"./img/fig_s3_{type_f}_land_data.png", dpi=300, transparent=True)
plt.show()

# Infered landscape and infered coordinates for training data
fig, bx = plt.subplots(1, figsize=(2.5, 2.5))
dlim_regressor.plot(bx, data)
bx.scatter(dlim_regressor.model.genes_emb[0][data.data[train_id, 0].long()].detach(),
          dlim_regressor.model.genes_emb[1][data.data[train_id, 1].long()].detach(),
           c=data.data[train_id, -1], s=2, cmap="bwr", marker="x")
for el in ["top", "right"]:
    bx.spines[el].set_visible(False)
plt.tight_layout()
plt.savefig(f"./img/fig_s3_{type_f}_land_pred.png", dpi=300, transparent=True)
plt.show()

# Infered phenotypes for testing data based on polynomial fitting
dlim_regressor.model.train_convert(A_id, data_simulated.A[A_id], 0)
dlim_regressor.model.train_convert(B_id, data_simulated.B[B_id], 1)

# Update the embedding, so the unseen data also gets an infered phenotype
dlim_regressor.model.update_emb(nA_id, data_simulated.A[nA_id], 0)
dlim_regressor.model.update_emb(nB_id, data_simulated.B[nB_id], 1)

# Visualize the fitting on phenotype
fig, (ax, bx) = plt.subplots(1, 2, figsize=(5, 2.5))
ax.scatter(dlim_regressor.model.genes_emb[0][A_id].detach(), data_simulated.A[A_id], c="black", s=20)
ax.scatter(dlim_regressor.model.genes_emb[0][nA_id].detach(), data_simulated.A[nA_id], c="orange", s=20)
ax.plot(np.polyval(dlim_regressor.model.conversion[0], np.linspace(0, 5, 100)), np.linspace(0, 5, 100), linewidth=1, linestyle="--", c="grey")
bx.scatter(dlim_regressor.model.genes_emb[1][B_id].detach(), data_simulated.B[B_id], c="black", s=20)
bx.scatter(dlim_regressor.model.genes_emb[1][nB_id].detach(), data_simulated.B[nB_id], c="orange", s=20)
bx.plot(np.polyval(dlim_regressor.model.conversion[1], np.linspace(0, 5, 100)), np.linspace(0, 5, 100), linewidth=1, linestyle="--", c="grey")
ax.set_xlabel("$\\varphi_1$")
ax.set_ylabel("$\\phi_1$")
bx.set_xlabel("$\\varphi_2$")
bx.set_ylabel("$\\phi_2$")
for el in ["top", "right"]:
    ax.spines[el].set_visible(False)
    bx.spines[el].set_visible(False)
plt.tight_layout()
plt.savefig(f"./img/fig_s3_{type_f}_cor_bio.png", dpi=300, transparent=True)
plt.show()

# Comparison of the prediction quality (polynomial fitting vs random)

fit_n, var_n, lat_n = dlim_regressor.predict(val_data.data[:, :-1], detach=True)
val_data_array = val_data.data[:, [-1]].numpy()
fig, ax = plt.subplots(1, figsize=(2.5, 2.5))
score = ((fit_n.flatten() -val_data_array.flatten())**2).mean()
score_v = ((fit_v.flatten() - val_data_array.flatten())**2).mean()

r2 = r2_score(fit_n.flatten(), val_data_array)
r2_v = r2_score(fit_v.flatten(), val_data_array)

ax.scatter(fit_n, val_data_array, s=2, c="orange", label=f"MSE$={score:.2f}$, $\\rho$ = {r2: 0.2f}")
ax.scatter(fit_v, val_data_array, s=2, c="grey", label=f"MSE$={score_v:.2f}$, $\\rho$ = {r2_v: 0.2f}")

ax.set_xlabel("$\\hat{F}$")
ax.set_ylabel("$F^{obs}$")
for el in ["top", "right"]:
    ax.spines[el].set_visible(False)
ax.legend(frameon=False, ncol=1, fontsize=9)
plt.tight_layout()
plt.savefig(f"./img/fig_s3_{type_f}new_fit.svg", dpi=300, transparent=True)
plt.show()
../_images/reproducibility_figure3_8_0.png
../_images/reproducibility_figure3_8_1.png
../_images/reproducibility_figure3_8_2.png
../_images/reproducibility_figure3_8_3.png
[ ]: