I have been working on PageRank for the CS50 AI course, my code:
import os
import random
import re
import sys
from numpy.random import choice
DAMPING = 0.85
SAMPLES = 10000
def main():
if len(sys.argv) != 2:
sys.exit("Usage: python pagerank.py corpus")
corpus = crawl(sys.argv[1])
k = sample_pagerank(corpus, DAMPING, SAMPLES)
print(f"PageRank Results from Sampling (n = {SAMPLES})")
for page in sorted(k):
print(f" {page}: {k[page]:.4f}")
k = iterate_pagerank(corpus, DAMPING)
print(f"PageRank Results from Iteration")
for page in sorted(k):
print(f" {page}: {k[page]:.4f}")
def crawl(directory):
"""
Parse a directory of HTML pages and check for links to other pages.
Return a dictionary where each key is a page, and values are
a list of all other pages in the corpus that are linked to by the page.
"""
pages = dict()
# Extract all links from HTML files
for filename in os.listdir(directory):
if not filename.endswith(".html"):
continue
with open(os.path.join(directory, filename)) as f:
contents = f.read()
links = re.findall(r"<as+(?:[^>]*?)href="([^"]*)"", contents)
pages[filename] = set(links) - {filename}
# Only include links to other pages in the corpus
for filename in pages:
pages[filename] = set(
link for link in pages[filename]
if link in pages
)
return pages
def transition_model(corpus, page, damping_factor):
"""
Return a probability distribution over which page to visit next,
given a current page.
With probability `damping_factor`, choose a link at random
linked to by `page`. With probability `1 - damping_factor`, choose
a link at random chosen from all pages in the corpus.
"""
l = len(corpus[page]) # number of links for that page
k = dict()
if l == 0:
for i in corpus.keys():
k[i] = 1/len(corpus)
else:
for i in corpus[page]:
k[i] = damping_factor / l + (1 - damping_factor) / (l + 1)
k[page] = (1 - damping_factor) / (l + 1)
return k
def sample_pagerank(corpus, damping_factor, n):
"""
Return PageRank values for each page by sampling `n` pages
according to transition model, starting with a page at random.
Return a dictionary where keys are page names, and values are
their estimated PageRank value (a value between 0 and 1). All
PageRank values should sum to 1.
"""
k = dict()
counts = dict()
pages = list(corpus.keys())
page = random.choice(pages)
for i in range(0, n - 1):
probs = transition_model(corpus, page, damping_factor)
psub = list(probs.keys())
probs = list(probs.values())
if page in counts:
counts[page] += 1
else:
counts[page] = 1
page = choice(psub, p=probs)
for i in counts.keys():
k[i] = counts[i] / n
return k
def iterate_pagerank(corpus, damping_factor):
"""
Return PageRank values for each page by iteratively updating
PageRank values until convergence.
Return a dictionary where keys are page names, and values are
their estimated PageRank value (a value between 0 and 1). All
PageRank values should sum to 1.
"""
# if any page has no links give it a link to every page
for i in corpus:
if len(corpus[i]) == 0:
corpus[i] = set(corpus.keys())
N = len(corpus)
# initialize ranks
k = {i: 1 / N for i in corpus}
def pagerank(page):
# start the algorithm
r = (1 - damping_factor) / N
rt = 0
# second part of algorithm
for i in corpus:
if page in corpus[i]:
rt += k[i] / len(corpus[i])
rt = damping_factor * rt
return r + rt
while True:
# new ranks
kn = {i: pagerank(i) for i in corpus}
# maximum changes to ranks
c = max(abs(kn[i] - k[i]) for i in corpus)
k = kn
# do this until changes becomes less that 0.001
if c < 0.001:
break
return k
if __name__ == "__main__":
main()
When I check it using check50 it gives this error relating to the iterate_pagerank
function:
check50 error
Saying that because pagerank 1 was 0.2282 and instead should be in the range [0.05996, 0.15996].
Because check50 gave the corpus and the expected range, I tried replacing line 15 with:
corpus = {'1': {'2'}, '2': {'3', '1'}, '3': {'2', '4'}, '4': {'2'}, '5': {'6'}, '6': {'7', '5'}, '7': {'6', '8'}, '8': {'6'}}
My result:
PageRank Results from Iteration
1: 0.1101
2: 0.2144
3: 0.1101
4: 0.0653
5: 0.1101
6: 0.2144
7: 0.1101
8: 0.0653
But the rank for page 1 is 0.1101, and that is between the range 0.05996 to 0.15996.
Is check50 wrong or am I missing something?
Sergei Srednyak is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.