November 28, 2019
This tutorial serves as a guide for creating a secure web server on BioLib. It comes with some tweakable presets that does a few things for you:
Adjust code in the following app so it meets whatever requirement your workflow have. Then export this to BioLib so that other scientists can use it to easily create Manhattan plots of their data and spot statistically significant genes from their experiments.
We use sample data found here: https://raw.githubusercontent.com/Pudkip/Pyhattan/master/data/China_Pharm.csv
Number of samples = 3352
Attributes:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import io
import argparse
import sys
import base64
# utility function to parse csv
def format_data_from_csv_buffer(buffer, sep ='\t', chromosome ='chr', p_value ='p_wald'):
data = pd.read_table(buffer, sep = sep)
data['-log10(p_value)'] = -np.log10(data[p_value])
data[chromosome] = data[chromosome].astype('category')
data['ind'] = range(len(data))
data_grouped = data.groupby((chromosome))
return data, data_grouped
# utility function to generate plot
def generate_manhattan(pyhattan_object, export_path = None, significance = 6, colors = ['#E24E42', '#008F95'], ref_snp = False, significant_genes=[]):
data = pyhattan_object[0]
data_grouped = pyhattan_object[1]
fig = plt.figure()
ax = fig.add_subplot(111)
x_labels = []
x_labels_pos = []
for num, (name, group) in enumerate(data_grouped):
group.plot(kind='scatter', x='ind', y='-log10(p_value)', color=colors[num % len(colors)], ax=ax, s= 10000/len(data))
x_labels.append(name)
x_labels_pos.append((group['ind'].iloc[-1] - (group['ind'].iloc[-1] - group['ind'].iloc[0]) / 2))
ax.set_xticks(x_labels_pos)
ax.set_xticklabels(x_labels)
ax.set_xlim([0, len(data)])
ax.set_ylim([0, data['-log10(p_value)'].max() + 1])
ax.set_xlabel('Chromosome')
plt.axhline(y=significance, color='gray', linestyle='-', linewidth = 0.5)
plt.xticks(fontsize=8, rotation=60)
plt.yticks(fontsize=8)
if ref_snp:
for index, row in data.iterrows():
if row['-log10(p_value)'] >= significance:
ax.annotate(str(row[ref_snp]), xy = (index, row['-log10(p_value)']))
significant_genes.append(str(row[ref_snp]))
if export_path:
plt.savefig(export_path)
return plt
# load significance from input parameter, or use 2.5 if none provided
parser = argparse.ArgumentParser()
parser.add_argument('--significance', dest='significance', default=2.5)
args = parser.parse_args()
# define some variables
title = 'Title'
description = 'This is a description'
ncbi_url = "https://www.ncbi.nlm.nih.gov/snp/?term="
# load input data
csv_binary = io.BytesIO(sys.stdin.read().encode())
data = format_data_from_csv_buffer(csv_binary, sep=',', chromosome='chromosome', p_value='p_value')
# generate plot
significant_genes = []
generate_manhattan(data, export_path='example.png', significance=float(args.significance), ref_snp='refSNP', significant_genes=significant_genes)
file_data = open("example.png", "rb").read()
data = base64.b64encode(file_data).decode('ascii')
markdown_of_plot = ''.format('image/png', data)
# generate info message
if len(significant_genes)>0:
result_text = f'The following genes was found significant with p > {args.significance} \n'
else:
result_text= f'WARNING: No genes had any significant p values > {args.significance} !'
# generate list of significant genes
significant_genes_list = f"- `{ncbi_url}" + f"`\n- `{ncbi_url}".join(significant_genes) + "`"
# generate final markdown report
report = f"# {title} \n {markdown_of_plot} \n ## Description \n {description} \n ## Results \n {result_text} \n{significant_genes_list}"
# print the report
print(report)
This python app runs on BioLib via WebAssembly. This means that the code block above is send directly to the users browser and executed there. None of the users data has to actually leave their computer, which gives a high degree of privacy.