Unconfused Optimum¶
This page is an example of excluding confuse factors from enzyme activity impact factors using API falsifypack, and the selection of mutants from random mutation library by Bayesian optimization, which is based on the causal graph verified previously.
Confuse Factors Exclusion¶
In [103]:
Copied!
import pandas as pd
import expAscribe as ea
priori = pd.read_csv('./data/Wetlab/partial_links.csv',header=None)
ea.array_to_gml(priori.to_numpy(),'./data/Wetlab/influence_path.gml',["binding energy","pocket volume","bottleneck radius","length", "enzymatic activity"])
import pandas as pd
import expAscribe as ea
priori = pd.read_csv('./data/Wetlab/partial_links.csv',header=None)
ea.array_to_gml(priori.to_numpy(),'./data/Wetlab/influence_path.gml',["binding energy","pocket volume","bottleneck radius","length", "enzymatic activity"])
In [104]:
Copied!
ana = ea.falsifypack('./data/Wetlab/augmented_data.csv',"./data/Wetlab/influence_path.gml",significance_level_=0.2,n_perm=100)
ana = ea.falsifypack('./data/Wetlab/augmented_data.csv',"./data/Wetlab/influence_path.gml",significance_level_=0.2,n_perm=100)
Test permutations of given graph: 100%|██████████| 100/100 [00:35<00:00, 2.81it/s]
In [3]:
Copied!
print(ana)
print(ana)
+-------------------------------------------------------------------------------------------------------+ | Falsificaton Summary | +-------------------------------------------------------------------------------------------------------+ | The given DAG is informative because 4 / 100 of the permutations lie in the Markov | | equivalence class of the given DAG (p-value: 0.04). | | The given DAG violates 15/16 LMCs and is better than 84.0% of the permuted DAGs (p-value: 0.16). | | Based on the provided significance level (0.2) and because the DAG is informative, | | we do not reject the DAG. | +-------------------------------------------------------------------------------------------------------+
In [93]:
Copied!
import networkx as nx
import matplotlib.pyplot as plt
G = nx.read_gml("./data/Wetlab/influence_path.gml")
isolated_nodes = [node for node, degree in dict(G.degree()).items() if degree == 0]
G.remove_nodes_from(isolated_nodes)
pos = nx.spring_layout(G, center=(0, 0))
nx.draw(G, pos, node_color='lightblue', edge_color='gray')
nx.draw_networkx_labels(G, pos)
plt.xlim(-1.5, 1.5)
plt.ylim(-1.5, 1.5)
plt.show()
import networkx as nx
import matplotlib.pyplot as plt
G = nx.read_gml("./data/Wetlab/influence_path.gml")
isolated_nodes = [node for node, degree in dict(G.degree()).items() if degree == 0]
G.remove_nodes_from(isolated_nodes)
pos = nx.spring_layout(G, center=(0, 0))
nx.draw(G, pos, node_color='lightblue', edge_color='gray')
nx.draw_networkx_labels(G, pos)
plt.xlim(-1.5, 1.5)
plt.ylim(-1.5, 1.5)
plt.show()
Bayesian Optimization¶
In [3]:
Copied!
data= pd.read_csv('../data/Wetlab/original_data.csv')
data = data.dropna()
data = data.select_dtypes(include=['number'])
data
data= pd.read_csv('../data/Wetlab/original_data.csv')
data = data.dropna()
data = data.select_dtypes(include=['number'])
data
Out[3]:
binding energy | pocket volume | bottleneck radius | length | enzymatic activity | |
---|---|---|---|---|---|
1 | -7.25 | 1441.0 | 2.0 | 5.7 | 85.65 |
2 | -6.77 | 1179.0 | 2.0 | 5.4 | 66.70 |
3 | -6.44 | 659.0 | 2.5 | 2.4 | 85.13 |
4 | -5.02 | 1267.0 | 2.2 | 3.4 | 93.89 |
5 | -6.34 | 1502.0 | 2.0 | 5.3 | 57.75 |
6 | -6.16 | 1262.0 | 2.1 | 4.9 | 49.01 |
7 | -6.34 | 1487.0 | 2.0 | 4.9 | 54.78 |
8 | -5.95 | 729.0 | 2.1 | 5.3 | 12.26 |
9 | -6.21 | 1229.0 | 2.0 | 4.9 | 40.33 |
10 | -6.34 | 653.0 | 2.5 | 2.4 | 99.75 |
11 | -6.26 | 1286.0 | 2.0 | 5.3 | 107.59 |
12 | -5.89 | 1414.0 | 2.0 | 4.9 | 66.61 |
13 | -5.23 | 1217.0 | 2.0 | 4.9 | 94.89 |
14 | -5.80 | 1517.0 | 2.0 | 5.3 | 32.86 |
15 | -6.47 | 1222.0 | 2.2 | 3.0 | 8.96 |
16 | -5.75 | 618.0 | 2.1 | 5.3 | 81.47 |
17 | -6.72 | 1234.0 | 2.0 | 4.9 | 10.30 |
18 | -5.59 | 1273.0 | 2.0 | 4.9 | 19.42 |
19 | -7.16 | 1250.0 | 2.0 | 5.4 | 50.65 |
20 | -5.22 | 661.0 | 2.5 | 2.8 | 37.47 |
21 | -6.36 | 647.0 | 2.1 | 5.3 | 36.04 |
22 | -3.83 | 1242.0 | 2.2 | 3.0 | 42.82 |
23 | -5.98 | 1176.0 | 2.0 | 4.9 | 92.43 |
In [4]:
Copied!
data_new = data[['pocket volume', 'length', ' enzymatic activity']].rename(columns={
'pocket volume': 'x',
'length': 'y',
' enzymatic activity': 'z'
})
data_new
data_new = data[['pocket volume', 'length', ' enzymatic activity']].rename(columns={
'pocket volume': 'x',
'length': 'y',
' enzymatic activity': 'z'
})
data_new
Out[4]:
x | y | z | |
---|---|---|---|
1 | 1441.0 | 5.7 | 85.65 |
2 | 1179.0 | 5.4 | 66.70 |
3 | 659.0 | 2.4 | 85.13 |
4 | 1267.0 | 3.4 | 93.89 |
5 | 1502.0 | 5.3 | 57.75 |
6 | 1262.0 | 4.9 | 49.01 |
7 | 1487.0 | 4.9 | 54.78 |
8 | 729.0 | 5.3 | 12.26 |
9 | 1229.0 | 4.9 | 40.33 |
10 | 653.0 | 2.4 | 99.75 |
11 | 1286.0 | 5.3 | 107.59 |
12 | 1414.0 | 4.9 | 66.61 |
13 | 1217.0 | 4.9 | 94.89 |
14 | 1517.0 | 5.3 | 32.86 |
15 | 1222.0 | 3.0 | 8.96 |
16 | 618.0 | 5.3 | 81.47 |
17 | 1234.0 | 4.9 | 10.30 |
18 | 1273.0 | 4.9 | 19.42 |
19 | 1250.0 | 5.4 | 50.65 |
20 | 661.0 | 2.8 | 37.47 |
21 | 647.0 | 5.3 | 36.04 |
22 | 1242.0 | 3.0 | 42.82 |
23 | 1176.0 | 4.9 | 92.43 |
In [ ]:
Copied!
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from bayes_opt import BayesianOptimization, UtilityFunction
from matplotlib import cm
from PIL import Image
import io
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
'''
data_new = pd.DataFrame({
'x': np.random.uniform(-2, 10, 100),
'y': np.random.uniform(-2, 10, 100),
'z': np.sin(np.random.uniform(-2, 10, 100)) + np.cos(np.random.uniform(-2, 10, 100))
})
'''
X_train = data_new[['x', 'y']].values
y_train = data_new['z'].values
kernel = C(1.5, (1e-2, 1e2)) * RBF(0.5, (1e-2, 1e2))
gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10)
gp.fit(X_train, y_train)
def target_function(x, y):
X_test = np.array([[x, y]])
return gp.predict(X_test)[0]
x = np.linspace(0, 2000, 100)
y = np.linspace(-10, 25, 100)
X, Y = np.meshgrid(x, y)
grid = np.c_[X.ravel(), Y.ravel()]
Z = np.array([target_function(xi, yi) for xi, yi in grid])
Z = Z.reshape(X.shape)
optimizer = BayesianOptimization(
f=target_function,
pbounds={'x': (0, 2000), 'y': (0, 20)},
random_state=42,
verbose=1
)
acq_function = UtilityFunction(kind="ei", kappa=5)
optimizer.register({'x': 1000, 'y': 6}, target_function(1000, 6))
optimizer.register({'x': 1200, 'y': 5}, target_function(1200, 5))
def posterior(optimizer, x_obs, y_obs, grid):
optimizer._gp.fit(x_obs, y_obs)
mu, sigma = optimizer._gp.predict(grid, return_std=True)
return mu, sigma
def plot_gp(optimizer, X, Y, Z, step=0):
fig, axs = plt.subplots(2, 2, figsize=(18, 14))
grid = np.c_[X.ravel(), Y.ravel()]
x_obs = np.array([[res["params"]["x"], res["params"]["y"]] for res in optimizer.res])
y_obs = np.array([res["target"] for res in optimizer.res])
mu, sigma = posterior(optimizer, x_obs, y_obs, grid)
mu = mu.reshape(X.shape)
sigma = sigma.reshape(X.shape)
c1 = axs[0, 0].contourf(X, Y, Z, cmap='rainbow')
axs[0, 0].scatter(x_obs[:, 0], x_obs[:, 1], color='grey', label='Observations')
axs[0, 0].set_title("Target Function")
fig.colorbar(c1, ax=axs[0, 0])
c2 = axs[0, 1].contourf(X, Y, mu, cmap='coolwarm')
axs[0, 1].scatter(x_obs[:, 0], x_obs[:, 1], color='grey', label='Observations')
axs[0, 1].set_title(f"Gaussian Process Predicted Mean After {step} Steps")
fig.colorbar(c2, ax=axs[0, 1])
c3 = axs[1, 0].contourf(X, Y, sigma, cmap='coolwarm')
axs[1, 0].scatter(x_obs[:, 0], x_obs[:, 1], color='grey', label='Observations')
axs[1, 0].set_title(f"Gaussian Process Variance After {step} Steps")
fig.colorbar(c3, ax=axs[1, 0])
utility_function = UtilityFunction(kind="ei", kappa=5, xi=0)
utility = utility_function.utility(grid, optimizer._gp, 0).reshape(X.shape)
c4 = axs[1, 1].contourf(X, Y, utility, cmap='inferno')
axs[1, 1].scatter(x_obs[:, 0], x_obs[:, 1], color='grey', label='Observations')
axs[1, 1].set_title(f"Acquisition Function (EI) After {step} Steps")
fig.colorbar(c4, ax=axs[1, 1])
plt.tight_layout()
buf = io.BytesIO()
plt.savefig(buf, format='png')
plt.close(fig)
buf.seek(0)
return buf
images = []
optimizer.maximize(init_points=5, n_iter=25)
for step in range(1, 25):
optimizer.maximize(init_points=0, n_iter=1, acquisition_function=acq_function)
buf = plot_gp(optimizer, X, Y, Z, step)
image = Image.open(buf)
images.append(image)
gif_path = 'optimization_process.gif'
images[0].save(
gif_path, save_all=True, append_images=images[1:], duration=500, loop=0
)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from bayes_opt import BayesianOptimization, UtilityFunction
from matplotlib import cm
from PIL import Image
import io
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
'''
data_new = pd.DataFrame({
'x': np.random.uniform(-2, 10, 100),
'y': np.random.uniform(-2, 10, 100),
'z': np.sin(np.random.uniform(-2, 10, 100)) + np.cos(np.random.uniform(-2, 10, 100))
})
'''
X_train = data_new[['x', 'y']].values
y_train = data_new['z'].values
kernel = C(1.5, (1e-2, 1e2)) * RBF(0.5, (1e-2, 1e2))
gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=10)
gp.fit(X_train, y_train)
def target_function(x, y):
X_test = np.array([[x, y]])
return gp.predict(X_test)[0]
x = np.linspace(0, 2000, 100)
y = np.linspace(-10, 25, 100)
X, Y = np.meshgrid(x, y)
grid = np.c_[X.ravel(), Y.ravel()]
Z = np.array([target_function(xi, yi) for xi, yi in grid])
Z = Z.reshape(X.shape)
optimizer = BayesianOptimization(
f=target_function,
pbounds={'x': (0, 2000), 'y': (0, 20)},
random_state=42,
verbose=1
)
acq_function = UtilityFunction(kind="ei", kappa=5)
optimizer.register({'x': 1000, 'y': 6}, target_function(1000, 6))
optimizer.register({'x': 1200, 'y': 5}, target_function(1200, 5))
def posterior(optimizer, x_obs, y_obs, grid):
optimizer._gp.fit(x_obs, y_obs)
mu, sigma = optimizer._gp.predict(grid, return_std=True)
return mu, sigma
def plot_gp(optimizer, X, Y, Z, step=0):
fig, axs = plt.subplots(2, 2, figsize=(18, 14))
grid = np.c_[X.ravel(), Y.ravel()]
x_obs = np.array([[res["params"]["x"], res["params"]["y"]] for res in optimizer.res])
y_obs = np.array([res["target"] for res in optimizer.res])
mu, sigma = posterior(optimizer, x_obs, y_obs, grid)
mu = mu.reshape(X.shape)
sigma = sigma.reshape(X.shape)
c1 = axs[0, 0].contourf(X, Y, Z, cmap='rainbow')
axs[0, 0].scatter(x_obs[:, 0], x_obs[:, 1], color='grey', label='Observations')
axs[0, 0].set_title("Target Function")
fig.colorbar(c1, ax=axs[0, 0])
c2 = axs[0, 1].contourf(X, Y, mu, cmap='coolwarm')
axs[0, 1].scatter(x_obs[:, 0], x_obs[:, 1], color='grey', label='Observations')
axs[0, 1].set_title(f"Gaussian Process Predicted Mean After {step} Steps")
fig.colorbar(c2, ax=axs[0, 1])
c3 = axs[1, 0].contourf(X, Y, sigma, cmap='coolwarm')
axs[1, 0].scatter(x_obs[:, 0], x_obs[:, 1], color='grey', label='Observations')
axs[1, 0].set_title(f"Gaussian Process Variance After {step} Steps")
fig.colorbar(c3, ax=axs[1, 0])
utility_function = UtilityFunction(kind="ei", kappa=5, xi=0)
utility = utility_function.utility(grid, optimizer._gp, 0).reshape(X.shape)
c4 = axs[1, 1].contourf(X, Y, utility, cmap='inferno')
axs[1, 1].scatter(x_obs[:, 0], x_obs[:, 1], color='grey', label='Observations')
axs[1, 1].set_title(f"Acquisition Function (EI) After {step} Steps")
fig.colorbar(c4, ax=axs[1, 1])
plt.tight_layout()
buf = io.BytesIO()
plt.savefig(buf, format='png')
plt.close(fig)
buf.seek(0)
return buf
images = []
optimizer.maximize(init_points=5, n_iter=25)
for step in range(1, 25):
optimizer.maximize(init_points=0, n_iter=1, acquisition_function=acq_function)
buf = plot_gp(optimizer, X, Y, Z, step)
image = Image.open(buf)
images.append(image)
gif_path = 'optimization_process.gif'
images[0].save(
gif_path, save_all=True, append_images=images[1:], duration=500, loop=0
)