PSP - Calculation of residues and areas of interchain contacts in protein complexes (SASA)

Welcome to follow my CSDN:https://spike.blog.csdn.net/
This article address:https://spike.blog.csdn.net/article/details/134884889

In protein complexes, the residues on the contact surface and the area of ​​the contact surface can be calculated through the inter-chain distance, using the BioPython library and the SASA (Solvent Accessible Surface Area, Solvent Accessible Surface Area) library. SASA calculation is mainly based on whether water molecules pass through. The radius of water molecules is 1.4 A and the diameter is 2.8 A. Therefore, the distance of 2.8 A is used to determine whether the chains are connected.

related functions:

  1. Get the chain list of protein complex, i.e.get_chain_ids()
  2. Obtain the residue-atom dictionary. Note that the atom name may be different from the name of the PDB, that isget_atom_list()
  3. Get the all-atom list, that isget_all_atoms()
  4. Determine whether the atoms are connected, that is,is_contact()
  5. Get all the contact surfaces and calculate the residue dimensions atomically, that isget_contacts()
  6. Print the display command of ChimeraX, that isget_chimerax_cmd()
  7. End-to-end calculated contact residues, i.e. cal_chain_interface_residue()
  8. Calculate the SASA area, i.e.cal_sasa()
  9. Obtain the substructure of the complex multi-chain, i.e.get_sub_structure()
  10. End-to-end calculated contact area, that is, cal_chain_interface_area()

Taking PDB 8j18 as an example, calculate the contact residues and areas between chain A and chain R:

WITH

The source code is as follows:

#!/usr/bin/env python
# -- coding: utf-8 --
"""
Copyright (c) 2022. All rights reserved.
Created by C. L. Wang on 2023/12/8
"""

import os

from Bio.PDB import NeighborSearch
from Bio.PDB import PDBParser, Selection
from Bio.PDB.Chain import Chain
from Bio.PDB.Model import Model
from Bio.PDB.Structure import Structure

from root_dir import DATA_DIR


class MainComplexChainDist(object):
    """
    复合物,多链距离计算
    """
    def __init__(self):
        pass

    @staticmethod
    def get_chain_ids(structure):
        chain_ids = []
        for chain in structure.get_chains():
            chain_ids.append(chain.id)
        return chain_ids

    @staticmethod
    def get_atom_list(structure, chains):
        """
        获取 atom list 列表
        注意: 输出的 atom 名字与 PDB 文件中的 atom 名字略有差异,例如
        8jtl 第1个,PRO: CG 对应 OG1,CD 对应 CG2
        8jtl 第2个,VAL: CG1 对应 CD1,CG2 对应 CD2
        """
        output = dict()
        for chain in structure:
            if chain.id in chains:
                for residue in chain.get_residues():
                    het_flag, res_seq, i_code = residue.get_id()
                    the_id = (chain.id + "_" + str(res_seq) + "_" + i_code).strip()
                    for atom in residue.get_unpacked_list():
                        if het_flag == ' ':
                            if the_id in output:
                                output[the_id].append(atom)
                            else:
                                output[the_id] = [atom]
        return output

    # Filter out all the atoms from the chain,residue map given by residue_map
    @staticmethod
    def get_all_atoms(residue_map):
        all_atoms_out = []
        for residue in residue_map:
            for atom in residue_map[residue]:
                all_atoms_out.append(atom)
                # Set the b-factor to zero for coloring by contacts
                atom.set_bfactor(0.0)
        return all_atoms_out

    @staticmethod
    def is_contact(res_1, other_atoms, cutoff):
        for atom in res_1:
            ns = NeighborSearch(other_atoms)
            center = atom.get_coord()
            neighbors = ns.search(center, cutoff)  # 5.0 for distance in angstrom
            residue_list = Selection.unfold_entities(neighbors, 'R')  # R for residues
            if len(residue_list) > 0:
                return True
        return False

    @classmethod
    def get_contacts(cls, struc, all_atoms, verbose, cutoff):
        progress = 0
        contacts = []
        for residue in struc:
            progress += 1
            # if len(verbose) > 0:
            #     print(verbose, progress, "out of", len(struc))
            atom_list = struc[residue]
            outcome = cls.is_contact(atom_list, all_atoms, cutoff)
            if outcome:
                contacts.append(residue)
        return contacts

    @staticmethod
    def get_chimerax_cmd(contacts, chain_id, pdb_id="2"):
        """
        计算命令
        """
        tmp_list = []
        for item in contacts:
            req_id = int(item.split("_")[1])
            tmp_list.append(req_id)
        contacts_gap = []
        prev, post = -1, -1
        for i in tmp_list:
            if prev == -1:
                prev = post = i
                continue
            if i - post == 1:
                post = i
            else:
                contacts_gap.append(f"#{
      
      pdb_id}/{
      
      chain_id}:{
      
      prev}-{
      
      post}")
                prev = post = i
        cmd = "select " + " ".join(contacts_gap)
        return cmd

    @classmethod
    def cal_chain_interface_residue(cls, structure, chain1, chain2, cutoff=7.5):
        """
        计算复合物结构的链内距离
        """

        chain_ids = cls.get_chain_ids(structure)
        # ['A', 'B', 'G', 'R', 'S']
        print(f"[Info] chain_ids: {
      
      chain_ids}")
        assert chain1 in chain_ids and chain2 in chain_ids

        # C 表示 chain,即将 structure 展开 chain 的形式
        # 同时支持 A, R, C, M, S
        structure_c = Selection.unfold_entities(structure, target_level="C")
        atom_dict1 = cls.get_atom_list(structure_c, chains=[chain1])
        atom_dict2 = cls.get_atom_list(structure_c, chains=[chain2])
        print(f"[Info] res_num: {
      
      len(atom_dict1.keys())}")

        all_atoms_1 = cls.get_all_atoms(atom_dict1)
        all_atoms_2 = cls.get_all_atoms(atom_dict2)
        cutoff = cutoff
        contacts_1 = cls.get_contacts(atom_dict1, all_atoms_2, "First molecule, residue ", cutoff)
        contacts_2 = cls.get_contacts(atom_dict2, all_atoms_1, "Second molecule, residue ", cutoff)
        print(f"[Info] contacts_1: {
      
      len(contacts_1)}")
        print(f"[Info] contacts_2: {
      
      len(contacts_2)}")
        return contacts_1, contacts_2

    @staticmethod
    def cal_sasa(structure):
        """
        计算单体的表面积
        """
        import freesasa
        structure = freesasa.structureFromBioPDB(structure)
        result = freesasa.calc(structure)
        # classArea = freesasa.classifyResults(result, structure)
        return result.totalArea()

    @staticmethod
    def get_sub_structure(structure, chain_list):
        """
        获取 PDB 结构的子结构
        """
        new_structure = Structure(0)
        new_model = Model(0)
        # 遍历模型中的所有链
        for chain in structure[0]:
            # 获取链的ID
            tmp_chain_id = chain.get_id()
            if tmp_chain_id not in chain_list:
                continue
            # 创建一个新的结构对象,只包含当前链

            new_chain = Chain(tmp_chain_id)
            new_chain.child_list = chain.child_list
            new_model.add(new_chain)
        new_structure.add(new_model)
        return new_structure

    @classmethod
    def cal_chain_interface_area(cls, structure, chain1, chain2):
        sub_all_structure = cls.get_sub_structure(structure, [chain1, chain2])
        sub1_structure = cls.get_sub_structure(structure, [chain1])
        sub2_structure = cls.get_sub_structure(structure, [chain2])
        v1_area = cls.cal_sasa(sub_all_structure)
        v2_area = cls.cal_sasa(sub1_structure) + cls.cal_sasa(sub2_structure)
        inter_area = max(int((v2_area - v1_area) // 2), 0)
        return inter_area

    def process(self, input_path, chain1, chain2):
        """
        核心逻辑
        """
        print(f"[Info] input_path: {
      
      input_path}")
        print(f"[Info] chain1: {
      
      chain1}, chain2: {
      
      chain2}")
        chain1 = chain1.upper()
        chain2 = chain2.upper()
        parser = PDBParser(QUIET=True)
        structure = parser.get_structure("def", input_path)
        contacts_1, contacts_2 = self.cal_chain_interface_residue(structure, chain1, chain2, cutoff=7.5)
        cmd1 = self.get_chimerax_cmd(contacts_1, chain1)
        cmd2 = self.get_chimerax_cmd(contacts_2, chain2)
        print(f"[Info] chimerax: {
      
      cmd1}")
        print(f"[Info] chimerax: {
      
      cmd2}")
        inter_area = self.cal_chain_interface_area(structure, chain1, chain2)
        print(f"[Info] inter_area: {
      
      inter_area}")


def main():
    ccd = MainComplexChainDist()

    # input_path = os.path.join(DATA_DIR, "templates", "8p3l.pdb")
    # ccd.process(input_path, "A", "D")  # 2562
    # ccd.process(input_path, "A", "J")  # 490

    input_path = os.path.join(DATA_DIR, "templates", "8j18.pdb")
    # ccd.process(input_path, "R", "S")  # 0
    ccd.process(input_path, "A", "R")  # 1114


if __name__ == '__main__':
    main()

Guess you like

Origin blog.csdn.net/u012515223/article/details/134884889