DeepPseudoMSI tutorial

PseudoMS image converter

The pseudo-MS image converter converts LC-MS-based untargeted metabolomics raw data into pseudo-MS images. The functions are written in R and provided in the R package pseudomsir, so please install this package first.

You can install pseudomsir from GitLab

Open R and run the code below.

if(!require(remotes)){
  install.packages("remotes")
}
remotes::install_gitlab("jaspershen/pseudomsir")

or GitHub

remotes::install_github("deepPseudoMSI-project/pseudomsir")

Convert mass spectrometry data to pseudoMS image

Please convert your mass spectrometry raw data to mzXML or mzML format first, using msconvert or the R package massconverter.

Then use the convert_raw_data2pseudoms_image function from the pseudomsir package to convert the mzXML or mzML files to images.

library(pseudomsir)

Then put the mzXML data in the demo_data folder.

convert_raw_data2pseudoms_image(file_name = "demo_data/QCP11.mzXML")

One image (PNG) with the same name as the mzXML file will then be written to the same folder.
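
If you have many raw data files, a simple loop over the folder works as well. The sketch below only uses base R and the same function shown above, and it assumes all your mzXML files sit in the demo_data folder.

# batch-conversion sketch: convert every mzXML file in demo_data
mzxml_files <- list.files("demo_data", pattern = "\\.mzXML$", full.names = TRUE)
for (file in mzxml_files) {
  convert_raw_data2pseudoms_image(file_name = file)
}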

Data augmentation for the training dataset

We developed an augmentation strategy to simulate pseudo-MS images for training. It also uses the convert_raw_data2pseudoms_image function from the pseudomsir package, but with the shift parameters set. For each raw data file, several shifted images can be generated and used for training.

convert_raw_data2pseudoms_image(file_name = "demo_data/QCP11.mzXML",
                                mz_shift = TRUE, 
                                rt_shift = TRUE, 
                                rt_diff = 10, 
                                int_shift = TRUE, 
                                int_times = 1.1)
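
To get several shifted images from one file, the function can simply be called repeatedly with different shift settings. The sketch below is only an illustration: the rt_diff and int_times values are arbitrary examples, and you should check how the shifted output images are named so that repeated calls do not overwrite each other.

# illustration only: repeated calls with different (arbitrary) shift settings
for (rt_diff in c(5, 10, 15)) {
  for (int_times in c(0.9, 1.1)) {
    convert_raw_data2pseudoms_image(file_name = "demo_data/QCP11.mzXML",
                                    mz_shift = TRUE,
                                    rt_shift = TRUE,
                                    rt_diff = rt_diff,
                                    int_shift = TRUE,
                                    int_times = int_times)
  }
}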

Pseudo-MS image predictor

The pseudo-MS image predictor is written in Python, so please open Python and use the code below for the analysis.

The image_reader.py script is used to read the images. It can be found on GitHub (https://github.com/jaspershen/deepPseudoMSI/tree/main/code/pseudoMS-image-predictor).

#### we referenced code from: https://github.com/ignacio-rocco/cnngeometric_pytorch

import tensorflow as tf
import numpy as np
import cv2
import matplotlib.pyplot as plt
import scipy.misc
from skimage import io
import pandas as pd
import os
tf.keras.backend.set_floatx('float32')

### the output of each cost function is a tensor of shape TensorShape([batch_size])
def image_reader(csv_file, image_dir, output_shape = (224,224)):
    # each row of the csv file gives: image file name, g_stage, weeks_to_delivery, class
    data = pd.read_csv(csv_file)
    image_names = data.iloc[:,0]
    g_stages = data.iloc[:,1].astype('float32')
    weeks_to_dlvrys = data.iloc[:,2].astype('float32')
    classes = data.iloc[:,3]

    num_of_images = len(image_names)
    image = np.zeros([num_of_images,output_shape[0],output_shape[1],3])

    # read and resize every image into one array
    for idx in range(0,num_of_images):
        image_path = os.path.join(image_dir,image_names[idx])
        print(image_path)
        image_2d = cv2.imread(image_path)
        image_2d = cv2.resize(image_2d, output_shape)
        image[idx,:,:,:] = image_2d

    image = image.astype('float32')

    # reshape the label columns into (num_of_images, 1) arrays
    stages_array = np.zeros(g_stages.shape[0])
    for i in range(0,g_stages.shape[0]):
        stages_array[i] = g_stages[i]

    delivery_array = np.zeros(weeks_to_dlvrys.shape[0])
    for i in range(0,weeks_to_dlvrys.shape[0]):
        delivery_array[i] = weeks_to_dlvrys[i]

    classes_array = np.zeros(classes.shape[0])
    for i in range(0,classes.shape[0]):
        classes_array[i] = classes[i]

    stages_array = np.reshape(stages_array,(g_stages.shape[0],1))
    delivery_array = np.reshape(delivery_array,(weeks_to_dlvrys.shape[0],1))
    classes_array = np.reshape(classes_array,(classes.shape[0],1))

    dataset = {'image': image, 'g_stages': stages_array, 'delivery': delivery_array, 'classes': classes_array}

    return dataset
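
As a usage sketch (the paths below are just the defaults used later in train.py): the csv file is expected to list, per row, the image file name, g_stage, weeks_to_delivery and class (fold label), and the function returns a dictionary of numpy arrays.

# usage sketch: read the images and labels listed in the training csv file
dataset = image_reader('../denmark_rplc_pos_224_224/train_all_shifts.csv',
                       '../denmark_rplc_pos_224_224/',
                       output_shape = (224, 224))
print(dataset['image'].shape)     # (num_of_images, 224, 224, 3)
print(dataset['g_stages'].shape)  # (num_of_images, 1)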

The train.py script is used to do the training (https://github.com/jaspershen/deepPseudoMSI/tree/main/code/pseudoMS-image-predictor).

from __future__ import print_function

import os
import sys
from argparse import ArgumentParser
from time import time

import pandas as pd
import tensorflow as tf
import numpy as np
import cv2
import matplotlib.pyplot as plt
import scipy.misc
from tensorflow.keras.optimizers import Adam

### import self-defined functions
from model import *
from image_reader import *
from loss import *


tf.keras.backend.set_floatx('float32')

def build_parser():
    parser = ArgumentParser()
    
    # Paths
    parser.add_argument('--image-dir', type=str, default='../denmark_rplc_pos_224_224/', help='path to folder of training images')
    parser.add_argument('--train-file', type=str, default='../denmark_rplc_pos_224_224/train_all_shifts.csv', help='path to csv file of training/testing examples')
    parser.add_argument('--trained-model-dir', type=str, default='../trained_models/', help='path to trained models folder')
    parser.add_argument('--trained-model-fn', type=str, default='model_224x224_fold5', help='trained model filename')
    parser.add_argument('--result-name', type=str, default='../trained_models/results_224x224_fold5.csv', help='path to csv file where training results are stored')
    # Optimization parameters 
    parser.add_argument('--lr', type=float, default=0.0001, help='learning rate')
    parser.add_argument('--num-epochs', type=int, default=100, help='number of training epochs')
    parser.add_argument('--batch-size', type=int, default=8, help='training batch size')
    parser.add_argument('--image-size', type=int, default=224, help='size of image used for training and testing')
    parser.add_argument('--gpu-id', type=int, default=0, help='which GPU to use')
    # Model parameters
    parser.add_argument('--feature-cnn', type=str, default='vgg16', help='Feature extraction network: vgg16/resnet101')
    
    return parser
   
   
def main():

    parser = build_parser()
    args = parser.parse_args()

    devices = tf.config.experimental.list_physical_devices('GPU')
    for device in devices:
        tf.config.experimental.set_memory_growth(device, True)
    tf.config.experimental.set_visible_devices(devices[args.gpu_id], 'GPU')
    
    
    train_losses = np.zeros(args.num_epochs)
    validation_losses = np.zeros(args.num_epochs)
    data = pd.read_csv(args.train_file)
    
    dataset = image_reader(args.train_file, args.image_dir, output_shape = (args.image_size,args.image_size))

    image = dataset['image']
    g_stages = dataset['g_stages']
    delivery = dataset['delivery']
    classes = dataset['classes']

    
    index_train = np.where(classes.reshape(classes.shape[0],)!=5)
    index_val = np.where(classes.reshape(classes.shape[0],)==5)

    image_train = image[index_train,:,:,:]
    g_stages_train = g_stages[index_train]
    image_test = image[index_val,:,:,:]
    g_stages_test = g_stages[index_val]

    num_of_train_images = g_stages_train.shape[0]
    num_of_validation_images = g_stages_test.shape[0]

    image_train = image_train.reshape((num_of_train_images,image.shape[1],image.shape[2],image.shape[3]))
    image_test = image_test.reshape((num_of_validation_images,image.shape[1],image.shape[2],image.shape[3]))
    
    
    #num_of_train_images = 2524
    #num_of_validation_images = 3000 - 2524
    #image_train = image[0:num_of_train_images,:,:,:]
    #g_stages_train = g_stages[0:num_of_train_images]
    #image_test = image[num_of_train_images:num_of_train_images + num_of_validation_images,:,:,:]
    #g_stages_test = g_stages[num_of_train_images:num_of_train_images + num_of_validation_images]
    

    
    print(image_train.shape)
    print(g_stages_train.shape)
    print(image_test.shape)
    print(g_stages_test.shape)
    
    input_shape = image.shape[1:4]
    model = reg_net(input_shape,feature_cnn=args.feature_cnn)
    model.summary()
    
    optimizer = tf.keras.optimizers.Adam(learning_rate=args.lr)

    for epoch in range(1,args.num_epochs+1):
        optimizer = tf.keras.optimizers.Adam(learning_rate=np.power(0.98,epoch)*args.lr)
        num_of_batches = int(num_of_train_images/args.batch_size)
        s = 0
        for idx in range(0,num_of_batches):
            
            batch_idx = np.random.randint(num_of_train_images, size=args.batch_size)
            
            image_batch = image_train[batch_idx, :]
            stage_batch = g_stages_train[batch_idx]

            with tf.GradientTape() as tape:
                    
                g_stage_predicted = model(image_batch)
                    
                loss = losses(stage_batch,g_stage_predicted)

            gradients = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(gradients,model.trainable_variables))
                
            ### sum up the training loss
            s = s + loss.numpy()
            
            
        ### compute validation loss
        loss_validation = 0
        g_stage_validation_predicted_array = np.zeros((num_of_validation_images,1))
        
        for i in range(0,num_of_validation_images):
            
            test_image = image_test[i,:,:,:]
            test_image = test_image.reshape((1,test_image.shape[0],test_image.shape[1],test_image.shape[2]))
            test_stage = g_stages_test[i]
        
            g_stage_validation_predicted = model(test_image)
        
    
            loss_validation = losses(test_stage,g_stage_validation_predicted) + loss_validation
            
            g_stage_validation_predicted_array[i] = g_stage_validation_predicted
            
        loss_validation = loss_validation/num_of_validation_images
        loss_validation = np.sqrt(loss_validation)
        
            
        print("epoch= " + str(epoch) + ",  train loss = " + str(format(np.sqrt(s/num_of_batches), '.3f')) +
        ",   validation loss = " + str(format(loss_validation, '.3f')))
            
        train_losses[epoch-1] = s/num_of_batches
        
        validation_losses[epoch-1] = loss_validation
            

    # save model for each image resolution
    model.save(args.trained_model_dir + args.trained_model_fn + '.h5')
    
    print('done!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
    array = np.empty((args.num_epochs + 10,3), dtype='U25')
    
    array[0,0] = "epoch"
    array[0,1] = "train_loss"
    array[0,2] = "validation_loss"
    
    for j in range(0,args.num_epochs):
        array[1 + j , 0] = str(j+1)
        array[1 + j , 1] = str(train_losses[j])
        array[1 + j , 2] = str(validation_losses[j])
    np.savetxt(args.result_name, array, delimiter=",", fmt='%s')
    

    
    np.savetxt('../trained_models/predicted_224x224_fold5.csv', np.concatenate((g_stages_test,g_stage_validation_predicted_array), axis=1), delimiter=",", fmt='%s')
    
    
if __name__ == '__main__':
    main()
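
A typical command line to start training (the paths are placeholders; every flag falls back to the default defined in build_parser above):

python train.py --image-dir ../denmark_rplc_pos_224_224/ --train-file ../denmark_rplc_pos_224_224/train_all_shifts.csv --trained-model-dir ../trained_models/ --trained-model-fn model_224x224_fold5 --num-epochs 100 --batch-size 8 --lr 0.0001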

The model.py script defines the deep learning model.

import tensorflow as tf
import tensorflow.keras.layers as KL
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model, load_model
from tensorflow.keras import regularizers
tf.keras.backend.set_floatx('float32')
 
def reg_net(input_shape, feature_cnn='vgg16'):
    x_in = Input(input_shape)
    
    if feature_cnn == 'vgg16' :
        model = tf.keras.Sequential()
        vgg16 = tf.keras.applications.VGG16(include_top=False, weights = 'imagenet', input_shape = input_shape)
        ### crop at the fourth pooling layer and replace max pooling with global average pooling
        for i in range(0,13):
            vgg16.layers[i].trainable = False
            model.add(vgg16.layers[i])
        for i in range(13,14):
            model.add(vgg16.get_layer(index=i))
            #model.add(Conv2D(filters=512, kernel_size=(3,3), padding="same", activation="relu", kernel_regularizer=regularizers.l2(0.001)))
            model.add(tf.keras.layers.Dropout(0.5))
        model.add(tf.keras.layers.GlobalAveragePooling2D())
        
        x = model(x_in)
        
    size = x.shape[1]
    factor = (size)** (1. / 3)
        
    fcl_model =  tf.keras.Sequential()
    fcl_model.add(Dense(int(size/factor), input_shape=(size,), activation=tf.nn.relu,kernel_regularizer=regularizers.l2(1)))
    fcl_model.add(Dense(int(size/(factor*factor)), activation=tf.nn.relu, kernel_regularizer=regularizers.l2(1)))
    #fcl_model.add(Dense(int(size/factor), input_shape=(size,), activation=tf.nn.relu))
    #fcl_model.add(Dense(int(size/(factor*factor)), activation=tf.nn.relu))
    fcl_model.add(Dense(1))    
    
    y_out = fcl_model(x)
  
    
    return Model(inputs = x_in, outputs = y_out)
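
As a quick sanity check of the model definition (assuming the 224x224 RGB input used in train.py), you can build the network and pass a random batch through it; the output should have shape (batch_size, 1) because the network predicts a single value per image.

# sanity-check sketch: build the regression network and run a dummy batch
import numpy as np
model = reg_net((224, 224, 3), feature_cnn='vgg16')
dummy_batch = np.random.rand(2, 224, 224, 3).astype('float32')
print(model(dummy_batch).shape)  # expected: (2, 1)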

And the test.py script is used to run the test.

from __future__ import print_function

import os
import sys
from argparse import ArgumentParser
from time import time

import pandas as pd
import tensorflow as tf
import numpy as np
import cv2
import matplotlib.pyplot as plt
import scipy.misc
from tensorflow.keras.optimizers import Adam
import glob

### import self-defined functions
from model import *
from image_reader import *
from loss import *

tf.keras.backend.set_floatx('float32')


def build_parser():
    parser = ArgumentParser()
    
    # Paths
    parser.add_argument('--image-dir', type=str, default='../validation200_224_224/', help='path to folder of test images')
    parser.add_argument('--trained-model-dir', type=str, default='../trained_models/', help='path to trained models folder')
    parser.add_argument('--trained-model-fn', type=str, default='model_224x224_fold5', help='trained model filename')
    parser.add_argument('--result-dir', type=str, default='../results/', help='directory to store prediction results')
    # Optimization parameters 
    parser.add_argument('--gpu-id', type=int, default=0, help='which GPU to use')
    # Model parameters
    parser.add_argument('--feature-cnn', type=str, default='vgg16', help='Feature extraction network: vgg16/resnet101')
    parser.add_argument('--image-size', type=int, default=224, help='size of image used for training and testing')
    
    return parser
   
   
def main():

    parser = build_parser()
    args = parser.parse_args()
    
    devices = tf.config.experimental.list_physical_devices('GPU')
    print(devices)
    for device in devices:
        tf.config.experimental.set_memory_growth(device, True)
    tf.config.experimental.set_visible_devices(devices[args.gpu_id], 'GPU')
    
    image_names = glob.glob(args.image_dir + '*.png')
    num_of_test_images = len(image_names)
    
    model = tf.keras.models.load_model(args.trained_model_dir + args.trained_model_fn + '.h5')
        
    g_stage_test_predicted_array = np.zeros((num_of_test_images,1))
        
    for i in range(0,num_of_test_images):
        test_image = cv2.imread(image_names[i])
        # resize to the image size the model was trained with (uses the --image-size argument)
        test_image = cv2.resize(test_image, (args.image_size, args.image_size))
        test_image = test_image.astype('float32')
        test_image = test_image.reshape((1,test_image.shape[0],test_image.shape[1],test_image.shape[2]))
        g_stage_test_predicted_array[i] = model(test_image)

    print('done!')
    
    image_names = np.array(image_names)
    image_names = image_names.reshape((image_names.shape[0],1))
    
    np.savetxt(args.result_dir + 'external_validation_dataset2_fold5.csv', np.concatenate((image_names,g_stage_test_predicted_array),axis=1), delimiter="," ,fmt='%s')
    
if __name__ == '__main__':
    main()
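
An example invocation (the image folder is a placeholder; it should contain the png pseudo-MS images to predict, and the model trained by train.py must already exist in --trained-model-dir):

python test.py --image-dir ../validation200_224_224/ --trained-model-dir ../trained_models/ --trained-model-fn model_224x224_fold5 --result-dir ../results/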