Source code for SIDISH.ppi_network_handler

import pandas as pd
import numpy as np
import os

class PPINetworkHandler:
    """Build a PPI network from HIPPIE and STRING files and query gene neighbors.

    Interactions are restricted to genes found in ``adata.var.index``.
    """

    def __init__(self, adata):
        """
        Initialize the PPI network handler.

        Parameters:
            adata: AnnData
                An AnnData object containing gene expression data. This class
                only reads ``adata.var.index`` (the gene names).
        """
        self.adata = adata
        # Filled in by load_network(); None until the network has been loaded.
        self.ppi_df = None

    def load_network(self, threshold=0.8, ppi_dir=None):
        """
        Load the PPI network by merging interactions from HIPPIE and STRING files.

        Files read from ``ppi_dir``:
            - hippie_current.txt
            - 9606.protein.links.v11.5.txt
            - 9606.protein.info.v11.5.txt

        Steps:
            1. Build a STRING-id -> gene-name map from the info file, keeping
               only genes present in the AnnData object.
            2. Keep HIPPIE interactions whose score is >= threshold; each one
               is added in both directions with its score scaled by 1000.
            3. Keep STRING links whose score is >= threshold * 1000 and whose
               two endpoints are both in the gene map.
            4. Concatenate both sources and drop exact duplicate rows. A pair
               reported with different weights by the two sources keeps both
               rows.

        Parameters:
            threshold: float, optional (default=0.8)
                Minimum interaction score, on the 0-1 scale.
            ppi_dir: str, optional (default=None)
                Directory holding the three input files. When None, the
                path "../data/PPI" is used, which is resolved against the
                current working directory (not this module's location).

        Returns:
            pd.DataFrame: The merged PPI network with columns "Source",
            "Target", and "Weight" (integer score on the 0-1000 scale). The
            same frame is stored in ``self.ppi_df``.

        Raises:
            FileNotFoundError: If any of the three input files is missing.
        """
        if ppi_dir is None:
            ppi_dir = os.path.join(os.path.dirname("../data/"), "PPI")
        info_file = os.path.join(ppi_dir, "9606.protein.info.v11.5.txt")
        links_file = os.path.join(ppi_dir, "9606.protein.links.v11.5.txt")
        hippie_file = os.path.join(ppi_dir, "hippie_current.txt")

        # Set for O(1) membership tests while scanning the input files.
        genes = set(self.adata.var.index.values)

        gene_map = self._read_string_gene_map(info_file, genes)
        interactions = self._read_string_links(links_file, gene_map, threshold)
        interactions.extend(self._read_hippie(hippie_file, genes, threshold))

        df = pd.DataFrame(interactions, columns=["Source", "Target", "Weight"])
        self.ppi_df = df.drop_duplicates()
        return self.ppi_df

    @staticmethod
    def _read_string_gene_map(info_file, genes):
        """Map STRING protein ids to gene names, keeping only names in ``genes``."""
        gene_map = {}
        with open(info_file, "r", encoding="utf-8") as f:
            next(f, None)  # Skip header
            for line in f:
                fields = line.split("\t")
                if len(fields) < 2:
                    # Blank or truncated line: nothing to map.
                    continue
                name = fields[1].strip()
                if name in genes:
                    gene_map[fields[0]] = name
        return gene_map

    @staticmethod
    def _read_hippie(hippie_file, genes, threshold):
        """Return HIPPIE interactions (both directions) scoring >= ``threshold``."""
        edges = []
        with open(hippie_file, "r", encoding="utf-8") as f:
            next(f, None)  # The first line is treated as a header and skipped.
            for line in f:
                parts = line.split("\t")
                if len(parts) < 5:
                    continue
                try:
                    score_value = float(parts[4])
                except ValueError:
                    continue
                if score_value < threshold:
                    continue
                # The identifier is the text before the first underscore.
                gene_a = parts[0].split("_")[0]
                gene_b = parts[2].split("_")[0]
                if gene_a in genes and gene_b in genes:
                    # Scale to the STRING 0-1000 range; round() avoids float
                    # truncation producing a value one below the intended score.
                    score_int = int(round(score_value * 1000))
                    edges.append([gene_a, gene_b, score_int])
                    edges.append([gene_b, gene_a, score_int])
        return edges

    @staticmethod
    def _read_string_links(links_file, gene_map, threshold):
        """Return STRING links scoring >= ``threshold * 1000`` between mapped genes."""
        edges = []
        min_score = threshold * 1000
        with open(links_file, "r", encoding="utf-8") as f:
            next(f, None)  # Skip header
            for line in f:
                parts = line.split()
                if len(parts) < 3:
                    continue
                try:
                    score = int(parts[2])
                except ValueError:
                    continue
                if score < min_score:
                    continue
                source = gene_map.get(parts[0])
                target = gene_map.get(parts[1])
                if source is None or target is None:
                    continue
                edges.append([source, target, score])
        return edges

    def get_neighbors(self, target_gene):
        """
        Retrieve direct and indirect neighbors of a target gene in the PPI network.

        Parameters:
            target_gene: str
                The gene for which to retrieve neighbors.

        Returns:
            tuple: A tuple containing:
                - list of direct neighbors (genes sharing an edge with the
                  target, in either direction)
                - list of indirect neighbors (genes sharing an edge with a
                  direct neighbor, excluding direct neighbors and the target)
            Both lists are built from sets, so their order is unspecified.

        Raises:
            RuntimeError: If load_network() has not been called yet.
        """
        if self.ppi_df is None:
            raise RuntimeError(
                "PPI network is not loaded; call load_network() first."
            )

        sources = self.ppi_df["Source"]
        targets = self.ppi_df["Target"]

        direct_neighbors = set(targets[sources == target_gene]).union(
            sources[targets == target_gene]
        )

        # One vectorized pass selects every edge touching any direct neighbor,
        # instead of rescanning the whole frame once per neighbor.
        direct_list = list(direct_neighbors)
        touching = sources.isin(direct_list) | targets.isin(direct_list)
        candidates = set(sources[touching]).union(targets[touching])

        indirect_neighbors = candidates - direct_neighbors - {target_gene}
        return list(direct_neighbors), list(indirect_neighbors)