# import necessary libraries
import numpy as np
from scipy import sparse
from numpy.linalg import norm
import time
np.seterr(divide='ignore')
# function to parse the data. The input file contains, in compressed form,
# the connectivity matrix for the webpages of Stanford University. Storing
# all that information naively in a dense numpy array would lead to a memory
# error, so a sparse matrix is used instead.
def load_data(data):
# list to store the origin pages,
# i.e. the pages that link to other pages
from_pages = []
# list to store the destination pages,
# i.e. the pages that are linked to by other pages
to_pages = []
# dict_out maps each page to the number of its outgoing edges,
# dict_in maps each page to the list of pages linking to it
dict_out = {}
dict_in = {}
with open(data, 'r') as f:
# for each line of the initial data file
for line in f.readlines():
# get the value in the first column and subtract one:
# page ids start at 1 while matrix indices start at 0,
# so page 1 is stored at index 0
from_page = int(line.split()[0]) - 1
# Apply the same logic as above for the element of the second
# column.
to_page = int(line.split()[1]) - 1
# append the origin page to its list
from_pages.append(from_page)
# append the destination page to its list
to_pages.append(to_page)
# update dict_out with the out-degree of the origin page and
# dict_in with the list of in-links of the destination page
if from_page not in dict_out.keys():
dict_out[from_page] = 1
else:
dict_out[from_page] += 1
if to_page not in dict_in.keys():
dict_in[to_page] = [from_page]
else:
dict_in[to_page].append(from_page)
# compute number of edges
edges = len(from_pages)
# compute number of nodes
nodes = len(set(from_pages) | set(to_pages))
for node in range(nodes):
if node not in dict_in.keys():
dict_in[node] = []
# initialize a row-based linked-list sparse matrix holding 1/out-degree
# for each page; it will be used in the Gauss-Seidel implementation
# of PageRank
D = sparse.lil_matrix((nodes, 1))
for key in dict_out.keys():
D[key] = 1/dict_out[key]
# fill the connectivity (adjacency) matrix using a sparse structure,
# which keeps the implementation memory efficient
csr_m = sparse.csr_matrix(([1]*edges,
(to_pages, from_pages)),
shape=(nodes, nodes))
# return the two created matrices
return csr_m, D, dict_in
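A small illustrative sketch (a hypothetical three-edge toy graph, not part of the assignment data) of how the (to_pages, from_pages) pairs map into the sparse adjacency matrix built above: entry (i, j) = 1 means that page j links to page i.
toy_to = [1, 2, 0]     # destination pages (0-based); the edges are 0 -> 1, 0 -> 2, 1 -> 0
toy_from = [0, 0, 1]   # origin pages (0-based); node 2 has no outgoing links (dead end)
toy_adj = sparse.csr_matrix(([1]*3, (toy_to, toy_from)), shape=(3, 3))
print(toy_adj.toarray())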
def pagerank_galgo(G, alpha=0.85, tol=10**-8):
# The following implementation is based on: http://infolab.stanford.edu/~ullman/mmds/ch5.pdf.
# Additionally, it follows the lectures of the course "Data Mining" by Professor Yannis Kotidis (AUEB).
# The matrix G is the raw adjacency matrix, which still contains dead ends and closed communities
# (spider traps). These problems are handled during the iterations: instead of pre-adjusting the
# matrix P, the PageRank that leaks out at each step is reinserted into the computation.
# number of nodes (the adjacency matrix is square)
n = G.shape[0]
# out-degree of each node divided by alpha, so that r/out_alpha
# equals alpha * r_j / outdeg(j) for every node j
out_alpha = G.sum(axis=0).T/alpha
r = np.ones((n,1))/n
error = 1
t = 0
while error > tol:
t += 1
r_new = G.dot((r/out_alpha))
# L≤alpha due to dead-ends
L = r_new.sum()
# re-insert leaked page-rank
# now all ranks add to one
r_new += (1-L)/n
# compute the error as the Euclidean norm of the difference
# between the previous and the new ranks
error = np.linalg.norm(r-r_new)
# (the new ranks replace the current ranks at the end of the loop body)
if t == 2:
# list to return the nodes that converged from the
# second iteration
list_conv = []
for i in range(r.shape[0]):
if np.linalg.norm(r[i]-r_new[i]) < tol:
list_conv.append(i)
r = r_new
return r, t, list_conv
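As a quick hedged check of the leaked-rank reinsertion, we can run pagerank_galgo on the hypothetical toy graph sketched above (node 2 is a dead end): the rank that leaks out through the dead end is put back uniformly, so the returned vector still sums to one.
toy_r, toy_t, _ = pagerank_galgo(toy_adj)
print("Toy ranks sum to:", float(toy_r.sum()), "after", toy_t, "iterations")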
def pagerank_gs(tol, G, D, dict_in, alpha):
# in this approach the ranks of the webpages are obtained by solving a
# linear system with the Gauss-Seidel method; the initial guess is again
# the uniform distribution, i.e. every webpage gets equal weight
x_old = np.ones(G.shape[0])/G.shape[0]
t = 0
error = 1
b = 1/G.shape[0]
D = alpha*D
x = x_old.copy()
# apply the element-wise Gauss-Seidel update for PageRank:
#   x_i <- b + alpha * sum_{j in In(i), j < i} x_j / outdeg(j)
#            + alpha * sum_{j in In(i), j > i} x_old_j / outdeg(j)
# The formula closely resembles the Jacobi update and coincides with SOR
# for omega equal to 1. The inputs are the raw matrix P without any
# pre-adjustments and a dictionary listing the in-links of each webpage.
# The implementation scheme is based on the paper:
# http://www.ece.ucsb.edu/~hespanha/published/2018ACC_0753_MS.pdf
while error > tol:
t += 1
for i in range(G.shape[0]):
sum_before = 0
sum_after = 0
for j in dict_in[i]:
if j < i:
sum_before += D.data[j][0] * x[j]
if j > i:
sum_after += D.data[j][0] * x_old[j]
x[i] = b + sum_before + sum_after
error = np.linalg.norm(x-x_old)/np.linalg.norm(x)
if t == 2:
# list to return the nodes that converged from the
# second iteration
list_conv = []
for i in range(x.shape[0]):
if np.linalg.norm(x[i]-x_old[i]) < tol:
list_conv.append(i)
print('Iteration:', t, '==> Relative Error:', error)
x_old = x.copy()
return x, list_conv
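A minimal usage sketch for pagerank_gs on the same hypothetical toy graph, with D and dict_in built by hand instead of through load_data; a loose tolerance keeps the printed output short.
toy_D = sparse.lil_matrix((3, 1))
toy_D[0] = 1/2   # node 0 has two outgoing links
toy_D[1] = 1     # node 1 has one outgoing link; node 2 is a dead end and stays empty
toy_dict_in = {0: [1], 1: [0], 2: [0]}   # incoming sources of each node
toy_x, _ = pagerank_gs(10**-4, toy_adj, toy_D, toy_dict_in, 0.85)
print("Toy Gauss-Seidel solution:", toy_x)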
csr_m, D, dict_in = load_data("stanweb.dat")
start_time = time.time()
pr, t, list_conv = pagerank_galgo(csr_m)
print("Time of Pagerank with Google's Algorithm:",
time.time() - start_time, "seconds")
print ("Number of iterations:", t)
mat_prg = np.asarray(pr).ravel()
start_time = time.time()
gs, list_convgs = pagerank_gs(10**-8, csr_m, D, dict_in, 0.85)
print("Time of Pagerank with Gauss-Seidel:",
time.time() - start_time, "seconds")
mat_prgs = np.asarray(gs).ravel()
The power method turns out to be much faster than the Gauss-Seidel version. The two methods also do not produce identical orderings: the rankings differ in 121562 positions.
indices_pr = mat_prg.argsort()[-len(mat_prg):][::-1]
indices_prs = mat_prgs.argsort()[-len(mat_prgs):][::-1]
print("The two aforementioned methods differ in:",
np.sum(indices_prs[:] != indices_pr[:]), "ranks.")
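For reference, a small sketch listing the ten highest-ranked pages under the default alpha = 0.85 (0-based indices, as used throughout the code).
print("Top 10 pages by PageRank:", indices_pr[:10].tolist())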
start_time = time.time()
pr_99, t, list_conv99 = pagerank_galgo(csr_m, 0.99, 10**-8)
print("Time of Pagerank with Google's Algorithm:",
time.time() - start_time, "seconds")
print ("Number of iterations:", t)
mat_prg_99 = np.asarray(pr_99).ravel()
start_time = time.time()
gs_99, list_convgs99 = pagerank_gs(10**-8, csr_m, D, dict_in, 0.99)
print("Time of Pagerank with Gauss-Seidel:",
time.time() - start_time, "seconds")
mat_prgs_99 = np.asarray(gs_99).ravel()
We observe that both methods converge more slowly for alpha = 0.99, with the Gauss-Seidel solution of the linear system slowing down the most. Again, the power method is much faster than the Gauss-Seidel version. The ranking of the first 50 nodes also changed: apart from the most important node, which remains 89072, all the others changed position.
indices_pr_99 = mat_prg_99.argsort()[-len(mat_prg):][::-1]
indices_prs_99 = mat_prgs_99.argsort()[-len(mat_prg):][::-1]
print("The two aforementioned methods differ in:",
sum(indices_prs_99[:] != indices_pr_99[:]), "ranks.")
print("We observe differences in:",
sum(indices_pr[:50] != indices_pr_99[:50]),
"ranks (power method).")
print("We observe differences in:",
sum(indices_prs[:50] != indices_prs_99[:50]),
"ranks (Gauss Seidel).")
We observe that the components of π that correspond to non-important pages converge faster, as shown below.
print("When we use the power method",
len(list_conv),
"components of π converge at the second iteration.")
counter = 0
for node in list_conv:
if node in indices_pr[:1000]:
counter += 1
print("From those components of π that converged faster",
counter,
"component(s) correspond to the top (1000) ranked nodes (Power method).")
print("When we find π through the solution of the linear system",
len(list_convgs),
"components of π converge at the second iteration.")
counter = 0
for node in list_convgs:
if node in indices_prs[:1000]:
counter += 1
print("From those components of π that converged faster", counter,
"component(s) correspond to the top (1000) ranked nodes (Gauss Seidel).")
# same parser as load_data, but the adjacency matrix is enlarged by one row
# and one column for a new dangling page X with no links. Sparse storage is
# used again to keep the memory footprint small.
def load_data1(data):
# list to store the origin pages,
# i.e. the pages that link to other pages
from_pages = []
# list to store the destination pages,
# i.e. the pages that are linked to by other pages
to_pages = []
with open(data, 'r') as f:
# for each line of the initial data file
for line in f.readlines():
# get the value in the first column and subtract one:
# page ids start at 1 while matrix indices start at 0,
# so page 1 is stored at index 0
from_page = int(line.split()[0]) - 1
# Apply the same logic as above for the element of the second
# column.
to_page = int(line.split()[1]) - 1
# append the origin page to its list
from_pages.append(from_page)
# append the destination page to its list
to_pages.append(to_page)
# compute number of edges
edges = len(from_pages)
# compute number of nodes
nodes = len(set(from_pages) | set(to_pages))
# fill the connectivity (adjacency) matrix using a sparse structure,
# which keeps the implementation memory efficient. The matrix is enlarged
# by one row and one column to introduce the new dangling page X,
# which has no in-links and no out-links.
csr_m = sparse.csr_matrix(([1]*edges,
(to_pages, from_pages)),
shape=(nodes + 1, nodes + 1))
# return the adjacency sparse matrix
return csr_m
csr_m1 = load_data1("stanweb.dat")
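A quick hedged check that the extra row and column reserved for the dangling page X are indeed empty, i.e. X has neither in-links nor out-links.
n1 = csr_m1.shape[0]
print("In-degree of X:", int(csr_m1[n1 - 1, :].sum()), "Out-degree of X:", int(csr_m1[:, n1 - 1].sum()))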
pr1, t1, list_conv1 = pagerank_galgo(csr_m1)
We observe that the PageRanks of the older pages remain almost unchanged after the addition of the new page, since none of them is connected to it; we should also bear in mind that the number of nodes is very large. The PageRank of the newly added page X follows directly from the PageRank formula: since no page links to X, its rank is essentially just the teleportation term (1 - alpha)/n = 0.15/281904.
pageRank_X = np.asarray(pr1[csr_m1.shape[0] - 1]).ravel()[0]
print("The PageRank of page X is:", pageRank_X)
norm_diff = np.linalg.norm(np.asarray(pr[:csr_m.shape[0]].ravel()) - np.asarray(pr1[:csr_m1.shape[0] - 1].ravel()))
print("Change in PageRank of the other pages due to the addition of the new page:", norm_diff)
mat_prg = np.asarray(pr).ravel()
mat_prg1 = np.asarray(pr1[:csr_m1.shape[0] - 1]).ravel()
indices_pr = mat_prg.argsort()[-len(mat_prg):][::-1]
indices_pr1 = mat_prg1.argsort()[-len(mat_prg1):][::-1]
print("Changes in:",
np.sum(indices_pr[:] != indices_pr1[:]), "ranks.")
# same parser as load_data, but two new pages Y and X are added together
# with a single edge Y -> X. Sparse storage is used again to keep the
# memory footprint small.
def load_data2(data):
# list to store the origin pages,
# i.e. the pages that link to other pages
from_pages = []
# list to store the destination pages,
# i.e. the pages that are linked to by other pages
to_pages = []
with open(data, 'r') as f:
# for each line of the initial data file
for line in f.readlines():
# get the value in the first column and subtract one:
# page ids start at 1 while matrix indices start at 0,
# so page 1 is stored at index 0
from_page = int(line.split()[0]) - 1
# Apply the same logic as above for the element of the second
# column.
to_page = int(line.split()[1]) - 1
# append the origin page to its list
from_pages.append(from_page)
# append the destination page to its list
to_pages.append(to_page)
# compute number of edges
edges = len(from_pages)
# compute number of nodes
nodes = len(set(from_pages) | set(to_pages))
# add a new edge Y -> X
from_pages.append(nodes)
to_pages.append(nodes + 1)
# increase the number of nodes by two for X, Y
nodes += 2
# increase the number of edges by one
edges += 1
# fill the connectivity (adjacency) matrix using a sparse structure,
# which keeps the implementation memory efficient
csr_m = sparse.csr_matrix(([1]*edges,
(to_pages, from_pages)),
shape=(nodes, nodes))
# return the adjacency sparse matrix
return csr_m
csr_m2 = load_data2("stanweb.dat")
pr2, t2, list_conv2 = pagerank_galgo(csr_m2)
We observe that the PageRanks of the older pages change more this time, due to the addition of the two new pages; still, since the number of nodes is very large, the changes remain small. The ranks of the new pages again follow directly from the PageRank formula: the PageRank of X clearly improves, because X now receives Y's entire damped rank on top of the teleportation term, while the PageRank of Y, which has no in-links, is essentially the same as that of X in the previous question.
pageRank_Xb = np.asarray(pr2[csr_m2.shape[0] - 1]).ravel()[0]
print("The PageRank of page X is:", pageRank_Xb)
pageRank_Yb = np.asarray(pr2[csr_m2.shape[0] - 2]).ravel()[0]
print("The PageRank of page Y is:", pageRank_Yb)
norm_diff = np.linalg.norm(np.asarray(pr[:csr_m.shape[0]].ravel()) - np.asarray(pr2[:csr_m2.shape[0] - 2].ravel()))
print("Change in PageRank of the other pages due to the addition of the new page:", norm_diff)
mat_prg = np.asarray(pr).ravel()
mat_prg2 = np.asarray(pr2[:csr_m2.shape[0] - 2]).ravel()
indices_pr = mat_prg.argsort()[-len(mat_prg):][::-1]
indices_pr2 = mat_prg2.argsort()[-len(mat_prg2):][::-1]
print("Changes in:",
np.sum(indices_pr[:] != indices_pr2[:]), "ranks.")
# same parser as load_data, but three new pages Y, Z and X are added
# together with the edges Y -> X and Z -> X. Sparse storage is used again
# to keep the memory footprint small.
def load_data3(data):
# list to store the origin pages,
# i.e. the pages that link to other pages
from_pages = []
# list to store the destination pages,
# i.e. the pages that are linked to by other pages
to_pages = []
with open(data, 'r') as f:
# for each line of the initial data file
for line in f.readlines():
# get the value in the first column and subtract one:
# page ids start at 1 while matrix indices start at 0,
# so page 1 is stored at index 0
from_page = int(line.split()[0]) - 1
# Apply the same logic as above for the element of the second
# column.
to_page = int(line.split()[1]) - 1
# append the origin page to its list
from_pages.append(from_page)
# append the destination page to its list
to_pages.append(to_page)
# compute number of edges
edges = len(from_pages)
# compute number of nodes
nodes = len(set(from_pages) | set(to_pages))
# add a new edge Y -> X
from_pages.append(nodes)
to_pages.append(nodes + 2)
# add a new edge Z -> X
from_pages.append(nodes + 1)
to_pages.append(nodes + 2)
# increase the number of nodes and edges as needed
nodes += 3
edges += 2
# fill the connectivity (adjacency) matrix using a sparse structure,
# which keeps the implementation memory efficient
csr_m = sparse.csr_matrix(([1]*edges,
(to_pages, from_pages)),
shape=(nodes, nodes))
# return the adjacency sparse matrix
return csr_m
csr_m3 = load_data3("stanweb.dat")
pr3, t3, list_conv3 = pagerank_galgo(csr_m3)
In order to maximize the PageRank of X, both Y and Z should point only to X, following the logic behind PageRank. This is implemented here, and the results are presented below: the PageRank of X indeed improves further.
pageRank_Xc = np.asarray(pr3[csr_m3.shape[0] - 1]).ravel()[0]
print("The PageRank of page X is:", pageRank_Xc)
pageRank_Yc = np.asarray(pr3[csr_m3.shape[0] - 3]).ravel()[0]
print("The PageRank of page Y is:", pageRank_Yc)
pageRank_Zc = np.asarray(pr3[csr_m3.shape[0] - 2]).ravel()[0]
print("The PageRank of page Z is:", pageRank_Zc)
norm_diff = np.linalg.norm(np.asarray(pr[:csr_m.shape[0]].ravel()) - np.asarray(pr3[:csr_m3.shape[0] - 3].ravel()))
print("Changes in PageRank of the other pages due to the addition of the new page:", norm_diff)
mat_prg = np.asarray(pr).ravel()
mat_prg3 = np.asarray(pr3[:csr_m3.shape[0] - 3]).ravel()
indices_pr = mat_prg.argsort()[-len(mat_prg):][::-1]
indices_pr3 = mat_prg3.argsort()[-len(mat_prg3):][::-1]
print("Changes in:",
np.sum(indices_pr[:] != indices_pr3[:]), "ranks.")
# determine the top 100 most popular pages; here 'popular' means pages with
# a high PageRank, since PageRank can be seen as a measure of popularity
top_100 = indices_pr[0:100]
# determine the top 100 most popular pages, this time measuring
# popularity by in-degree
list_degree = np.asarray([len(dict_in[x]) for x in sorted(list(dict_in.keys()))])
indices_degree = list_degree.argsort()[-len(list_degree):][::-1]
popular_100 = indices_degree[0:100]
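A small sketch comparing the two notions of popularity, i.e. how many pages appear both in the PageRank top 100 and in the in-degree top 100.
overlap = len(set(top_100) & set(indices_degree[:100]))
print("Pages appearing in both top-100 lists:", overlap)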
# same parser as load_data3 (new pages Y, Z and X with the edges Y -> X and
# Z -> X), but X additionally links to the 100 most popular older pages
# passed in as top_100. Sparse storage is used again.
def load_data4(data, top_100):
# list to store the origin pages,
# i.e. the pages that link to other pages
from_pages = []
# list to store the destination pages,
# i.e. the pages that are linked to by other pages
to_pages = []
with open(data, 'r') as f:
# for each line of the initial data file
for line in f.readlines():
# get the value in the first column and subtract one:
# page ids start at 1 while matrix indices start at 0,
# so page 1 is stored at index 0
from_page = int(line.split()[0]) - 1
# Apply the same logic as above for the element of the second
# column.
to_page = int(line.split()[1]) - 1
# append the origin page to its list
from_pages.append(from_page)
# append the destination page to its list
to_pages.append(to_page)
# compute number of edges
edges = len(from_pages)
# compute number of nodes
nodes = len(set(from_pages) | set(to_pages))
# add a new edge Y -> X
from_pages.append(nodes)
to_pages.append(nodes + 2)
# add a new edge Z -> X
from_pages.append(nodes + 1)
to_pages.append(nodes + 2)
# add links from X to older, popular pages
for node in top_100:
from_pages.append(nodes + 2)
to_pages.append(node)
# increase number of nodes and edges as needed
nodes += 3
edges += 102
# fill the connectivity (adjacency) matrix using a sparse structure,
# which keeps the implementation memory efficient
csr_m = sparse.csr_matrix(([1]*edges,
(to_pages, from_pages)),
shape=(nodes, nodes))
# return the adjacency sparse matrix
return csr_m
# same parser as load_data3 (new pages Y, Z and X with the edges Y -> X and
# Z -> X), but now the 100 extra links to the most popular older pages start
# from Y (or Z), i.e. the new page with index nodes + 1, instead of from X.
# Sparse storage is used again.
def load_data5(data, top_100):
# list to store the origin pages,
# i.e. the pages that link to other pages
from_pages = []
# list to store the destination pages,
# i.e. the pages that are linked to by other pages
to_pages = []
with open(data, 'r') as f:
# for each line of the initial data file
for line in f.readlines():
# get the value in the first column and subtract one:
# page ids start at 1 while matrix indices start at 0,
# so page 1 is stored at index 0
from_page = int(line.split()[0]) - 1
# Apply the same logic as above for the element of the second
# column.
to_page = int(line.split()[1]) - 1
# append the origin page to its list
from_pages.append(from_page)
# append the destination page to its list
to_pages.append(to_page)
# compute number of edges
edges = len(from_pages)
# compute number of nodes
nodes = len(set(from_pages) | set(to_pages))
# add a new edge Y -> X
from_pages.append(nodes)
to_pages.append(nodes + 2)
# add a new edge Z -> X
from_pages.append(nodes + 1)
to_pages.append(nodes + 2)
# add links from Y (or Z) to older, popular pages
for node in top_100:
from_pages.append(nodes + 1)
to_pages.append(node)
# increase number of nodes and edges as needed
nodes += 3
edges += 102
# fill the connectivity (adjacency) matrix using a sparse structure,
# which keeps the implementation memory efficient
csr_m = sparse.csr_matrix(([1]*edges,
(to_pages, from_pages)),
shape=(nodes, nodes))
# return the adjacency sparse matrix
return csr_m
csr_m4 = load_data4("stanweb.dat", top_100)
pr4, t4, list_conv4 = pagerank_galgo(csr_m4)
csr_m5 = load_data5("stanweb.dat", top_100)
pr5, t5, list_conv5 = pagerank_galgo(csr_m5)
We observe that if we add links from X to the older, popular pages (top 100), the PageRank of X becomes slightly worse, and if we instead add links from Y (or Z) to those pages, the PageRank of X becomes even worse. This is reasonable: the PageRank of Y (or Z) is now split among many more webpages, so only a small share of it flows to X. Consequently, this is not an effective way to increase the PageRank of X; in general, the PageRank of a page grows with the number of pages pointing to it.
# if we add links from X to older, popular pages
pageRank_Xd = np.asarray(pr4[csr_m4.shape[0] - 1]).ravel()[0]
print("If we add links from X to older, popular pages the PageRank of page X is:", pageRank_Xd)
# if we add links from Y to older, popular pages
pageRank_Xd2 = np.asarray(pr5[csr_m5.shape[0] - 1]).ravel()[0]
print("If we add links from Y to older, popular pages the PageRank of page X is:", pageRank_Xd2)
Based on the previous questions, it is clear that increasing the PageRank of a webpage is a tough task, though not an impossible one. The PageRank of a page increases as more pages point to it, and the contribution is larger when those pages themselves have a high PageRank. In general, the more pages pointing to a webpage, the higher its PageRank; interconnections among those pages can also help.