# Source code for SIDISH.ppi_network_handler

import pandas as pd
import numpy as np
import os


# [docs]
class PPINetworkHandler:
    """Handles PPI network construction and neighbor retrieval from Hippie and STRING files."""

    def __init__(self, adata):
        """
        Initialize the PPI network handler.

        Parameters:
            adata: AnnData
                An AnnData object containing gene expression data. Only
                ``adata.var.index`` (the gene names) is read by this class.
        """
        self.adata = adata
        # Populated by load_network(); stays None until the network is loaded.
        self.ppi_df = None

    def load_network(self, threshold=0.8, ppi_dir=None):
        """
        Load and merge the PPI network from the Hippie and STRING files.

        Files read from ``ppi_dir``:
            - Hippie file: hippie_current.txt
            - STRING links file: 9606.protein.links.v11.5.txt
            - STRING info file: 9606.protein.info.v11.5.txt

        The method performs the following steps:
          1. Builds a STRING-id -> gene-name mapping from the STRING info file
             (only including genes present in the AnnData object).
          2. Processes the Hippie file to extract interactions if the score is >= threshold.
          3. Processes the STRING links file to extract interactions if the score is >= threshold * 1000.
          4. Merges the interactions from both sources into one DataFrame and drops exact duplicate rows.
          5. Stores the result in ``self.ppi_df`` and returns it.

        Nothing is written to disk.

        Parameters:
            threshold: float, optional (default=0.8)
                Threshold for filtering interactions (Hippie scale; STRING scores
                are compared against ``threshold * 1000``).
            ppi_dir: str, optional (default=None)
                Directory containing the three input files. When None, the
                path ``../data/PPI`` is used, which is resolved against the
                current working directory (not against this module's location).

        Returns:
            pd.DataFrame:
                A DataFrame containing the merged PPI network with columns:
                "Source", "Target", and "Weight" (integer score on the STRING
                0-1000 scale).

        Raises:
            FileNotFoundError: If any of the three input files is missing.
        """
        if ppi_dir is None:
            # Default location; relative to the process working directory.
            ppi_dir = os.path.join(os.path.dirname("../data/"), "PPI")
        info_file = os.path.join(ppi_dir, "9606.protein.info.v11.5.txt")
        links_file = os.path.join(ppi_dir, "9606.protein.links.v11.5.txt")
        hippie_file = os.path.join(ppi_dir, "hippie_current.txt")

        # Genes outside the AnnData object are ignored by every parser below.
        adata_genes = set(self.adata.var.index.values)

        gene_map = self._read_gene_map(info_file, adata_genes)
        hippie_edges = self._read_hippie_edges(hippie_file, adata_genes, threshold)
        string_edges = self._read_string_edges(links_file, gene_map, threshold)

        # STRING rows first, then Hippie rows; identical (Source, Target, Weight)
        # rows contributed by both sources collapse to one.
        df = pd.DataFrame(
            string_edges + hippie_edges, columns=["Source", "Target", "Weight"]
        )
        df = df.drop_duplicates()

        self.ppi_df = df
        return self.ppi_df

    @staticmethod
    def _read_gene_map(info_file, genes):
        """Return {STRING protein id: gene name} for names in ``genes``; the first line is skipped."""
        gene_map = {}
        with open(info_file, "r") as f:
            next(f, None)  # Skip header
            for line in f:
                fields = line.split("\t")
                if len(fields) < 2:
                    # Blank or tab-less line: nothing to map.
                    continue
                name = fields[1].strip()
                if name in genes:
                    gene_map[fields[0]] = name
        return gene_map

    @staticmethod
    def _read_hippie_edges(hippie_file, genes, threshold):
        """Return [source, target, score] rows (both directions) for Hippie lines scoring >= threshold."""
        edges = []
        with open(hippie_file, "r") as f:
            next(f, None)  # The first line of the file is always skipped.
            for line in f:
                parts = line.split("\t")
                if len(parts) < 5:
                    continue
                # Identifiers are the text before the first underscore of
                # columns 1 and 3.
                gene_a = parts[0].split("_")[0]
                gene_b = parts[2].split("_")[0]
                try:
                    score_value = float(parts[4])
                except ValueError:
                    continue
                if score_value < threshold:
                    continue
                if gene_a in genes and gene_b in genes:
                    # Scale to match STRING file scores.
                    score_int = int(score_value * 1000)
                    # Each Hippie line is recorded in both directions.
                    edges.append([gene_a, gene_b, score_int])
                    edges.append([gene_b, gene_a, score_int])
        return edges

    @staticmethod
    def _read_string_edges(links_file, gene_map, threshold):
        """Return [source, target, score] rows for STRING links scoring >= threshold * 1000."""
        min_score = threshold * 1000
        edges = []
        with open(links_file, "r") as f:
            next(f, None)  # Skip header
            for line in f:
                parts = line.split()
                if len(parts) < 3:
                    continue
                try:
                    score = int(parts[2])
                except ValueError:
                    continue
                if score < min_score:
                    continue
                # Both endpoints must map to a gene kept in gene_map.
                if parts[0] in gene_map and parts[1] in gene_map:
                    edges.append([gene_map[parts[0]], gene_map[parts[1]], score])
        return edges

    def get_neighbors(self, target_gene):
        """
        Retrieve direct and indirect neighbors of a target gene in the PPI network.

        Edges are treated as undirected: a gene is a direct neighbor if it
        appears opposite ``target_gene`` in either the "Source" or "Target"
        column. Indirect neighbors are the genes on any edge touching a
        direct neighbor, excluding the direct neighbors and the target itself.

        Parameters:
            target_gene: str
                The gene for which to retrieve neighbors.

        Returns:
            tuple:
                A tuple containing:
                  - list of direct neighbors
                  - list of indirect neighbors
                Both lists are built from sets, so their order is unspecified.

        Raises:
            RuntimeError: If the network has not been loaded yet.
        """
        if self.ppi_df is None:
            raise RuntimeError(
                "PPI network is not loaded; call load_network() first."
            )

        sources = self.ppi_df["Source"]
        targets = self.ppi_df["Target"]

        # Direct neighbors: genes connected directly to the target.
        direct_neighbors = set(targets[sources == target_gene]) | set(
            sources[targets == target_gene]
        )

        # One vectorised mask over all edges that touch any direct neighbor,
        # instead of rescanning the whole table once per neighbor.
        direct_list = list(direct_neighbors)
        touching = sources.isin(direct_list) | targets.isin(direct_list)

        # Indirect neighbors: endpoints of those edges, excluding direct ones
        # and the target gene.
        indirect_neighbors = set(sources[touching]) | set(targets[touching])
        indirect_neighbors -= direct_neighbors
        indirect_neighbors.discard(target_gene)

        return list(direct_neighbors), list(indirect_neighbors)