Data Mining KDD field -CELF has become a classic of the social impact of maximizing network discovery algorithm , for improved greedy algorithm 's efficiency ( lifting 700% ). Get KDD 2019 's classic paper award , author: Jure Leskovec, paper: Cost-as Effective Outbreak Detection in Networks (2007).
CELF algorithm is based on the influence of features having submodularity raised, i.e., the influence of all the nodes in the node set as the number of nodes increased seed weakening, having monotonically decreasing resistance.
import matplotlib.pyplot as plt
from random import uniform, seed
import numpy as np
import time
from igraph import *
def IC(g,S,p=0.5,mc=1000):
"""
Input: graph object, set of seed nodes, propagation probability
and the number of Monte-Carlo simulations
Output: average number of nodes influenced by the seed nodes
"""
# Loop over the Monte-Carlo Simulations
spread = []
for i in range(mc):
# Simulate propagation process
new_active, A = S[:], S[:]
while new_active:
# For each newly active node, find its neighbors that become activated
new_ones = []
for node in new_active:
# Determine neighbors that become infected
np.random.seed(i)
success = np.random.uniform(0,1,len(g.neighbors(node,mode="out"))) < p
new_ones += list(np.extract(success, g.neighbors(node,mode="out")))
new_active = list(set(new_ones) - set(A))
# Add newly activated nodes to the set of activated nodes
A += new_active
spread.append(len(A))
return(np.mean(spread))
def celf(g,k,p=0.1,mc=1000):
"""
Input: graph object, number of seed nodes
Output: optimal seed set, resulting spread, time for each iteration
"""
# --------------------
# Find the first node with greedy algorithm
# --------------------
# Calculate the first iteration sorted list
start_time = time.time()
marg_gain = [IC(g,[node],p,mc) for node in range(g.vcount())]
# Create the sorted list of nodes and their marginal gain
Q = sorted(zip(range(g.vcount()),marg_gain), key=lambda x: x[1],reverse=True)
# Select the first node and remove from candidate list
S, spread, SPREAD = [Q[0][0]], Q[0][1], [Q[0][1]]
Q, LOOKUPS, timelapse = Q[1:], [g.vcount()], [time.time()-start_time]
# --------------------
# Find the next k-1 nodes using the list-sorting procedure
# --------------------
for _ in range(k-1):
check, node_lookup = False, 0
while not check:
# Count the number of times the spread is computed
node_lookup += 1
# Recalculate spread of top node
current = Q[0][0]
# Evaluate the spread function and store the marginal gain in the list
Q[0] = (current,IC(g,S+[current],p,mc) - spread)
# Re-sort the list
Q = sorted(Q, key = lambda x: x[1], reverse = True)
# Check if previous top node stayed on top after the sort
check = (Q[0][0] == current)
# Select the next node
spread += Q[0][1]
S.append(Q[0][0])
SPREAD.append(spread)
LOOKUPS.append(node_lookup)
timelapse.append(time.time() - start_time)
# Remove the selected node from the list
Q = Q[1:]
return(S,SPREAD,timelapse,LOOKUPS)
# Create simple network with 0 and 1 as the influential nodes
source = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,3,4,5]
target = [2,3,4,5,6,7,8,9,2,3,4,5,6,7,8,9,6,7,8,9]
g = Graph(directed=True)
g.add_vertices(range(10))
g.add_edges(zip(source,target))
# Plot graph
g.vs["label"], g.es["color"], g.vs["color"] = range(10), "#B3CDE3", "#FBB4AE"
plot(g,bbox = (200,200),margin = 20,layout = g.layout("kk"))
# Run algorithms
celf_output = celf(g,2,p = 0.2,mc = 1000)
# Print results
print("celf output: " + str(celf_output[0]))
Run Results: celf output: [0, 1]
Source Code: https://hautahi.com/im_greedycelf