Ok, I have been trying to create a simple search engine with Python and have run into a few errors which I'm not sure how to fix.
The program lets the user enter a URL, lists the URLs linked from that page, and brings back all the keywords. It then asks the user to search for a specific keyword, and that's when the error occurs.
I have used some of my own code and some I got off the web, and I can't seem to get it to work!
Basically the error is the following: 'dict' object has no attribute 'append'.
Is it because I'm using lists and not a dictionary? How would I fix it?
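Here's a tiny standalone example (not my actual code, just to show what I mean) that raises the exact same message:
Code:
index = {}            # a dict, like the index={} in Crawl_web below
index.append("word")  # AttributeError: 'dict' object has no attribute 'append'
So I suspect somewhere my code is calling append on a dict, but I can't see where.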
I'm quite new to Python, so apologies.
P.S. I have been running it against one of my own small closed websites, not out on the open web.
Cheers
Code:
import urllib2

max_limit=5

def get_page(url):
    # fetch the raw HTML for a url, returning "" on any error
    try:
        f = urllib2.urlopen(url)
        page = f.read()
        f.close()
        return page
    except:
        return ""
def getAllNewLinksOnPage(page,prevLinks):
    response = urllib2.urlopen(page)
    html = response.read()
    links,pos,allFound=[],0,False
    while not allFound:
        aTag=html.find("<a href=",pos)
        if aTag>-1:
            href=html.find('"',aTag+1)
            endHref=html.find('"',href+1)
            url=html[href+1:endHref]
            if url[:7]=="http://":
                if url[-1]=="/":
                    url=url[:-1]
                if not url in links and not url in prevLinks:
                    links.append(url)
                    print url
            closeTag=html.find("</a>",aTag)
            pos=closeTag+1
        else:
            allFound=True
    return links
def getLinks(url):
    toCrawl=[url]
    crawled=[]
    while toCrawl:
        url=toCrawl.pop()
        crawled.append(url)
        newLinks=getAllNewLinksOnPage(url,crawled)
        toCrawl=list(set(toCrawl)|set(newLinks))
    return crawled
url=raw_input("Enter a URL to Search:")
linksCrawled = getLinks(url)
print("----------------------")
def addToIndex(index,keyword,url):
    # index is a LIST of [keyword, [urls]] pairs here
    for entry in index:
        if entry[0]==keyword:
            if not url in entry[1]:
                entry[1].append(url)
            return
    index.append([keyword,[url]])
index=[]
response = urllib2.urlopen(url)
html = response.read()
pageText=""
html=html[html.find("<body")+5:html.find("</body>")]
finished=False
while not finished:
    nextCloseTag=html.find(">")
    nextOpenTag=html.find("<")
    if nextOpenTag>-1:
        content=" ".join(html[nextCloseTag+1:nextOpenTag].strip().split())
        pageText=pageText+" "+content
        html=html[nextOpenTag+1:]
    else:
        finished=True
for word in pageText.split():
    if word[0].isalnum() and len(word)>4:
        addToIndex(index,word,url)
print "{} unique words found".format(len(index))
print index
print("----------------------")
def compute_ranks(graph):
    # simplified PageRank with damping factor d, run for a fixed number of loops
    d=0.8
    numloops=10
    ranks={}
    npages=len(graph)
    for page in graph:
        ranks[page]=1.0/npages
    for i in range(0,numloops):
        newranks={}
        for page in graph:
            newrank=(1-d)/npages
            for node in graph:
                if page in graph[node]:
                    newrank=newrank+d*ranks[node]/len(graph[node])
            newranks[page]=newrank
        ranks=newranks
    return ranks
def Crawl_web(seed):
    tocrawl=[seed]
    crawled=[]
    index={}    # a DICT here, but addToIndex() above expects a list
    graph={}
    global max_limit
    while tocrawl:
        p=tocrawl.pop()
        if p not in crawled:
            max_limit-=1
            print max_limit
            if max_limit<=0:
                break
            c=get_page(p)
            addToIndex(index,p,c)   # this call ends up hitting 'dict' object has no attribute 'append'
            f=get_all_links(c)      # get_all_links() isn't defined anywhere in my file
            union(tocrawl,f)        # neither is union()
            graph[p]=f
            crawled.append(p)
    return crawled,index,graph
def QuickSort(pages,ranks):
    # partition pages around the first page's rank, highest rank first
    if len(pages)>1:
        piv=ranks[pages[0]]
        i=1
        j=1
        for j in range(1,len(pages)):
            if ranks[pages[j]]>piv:
                pages[i],pages[j]=pages[j],pages[i]
                i+=1
        pages[i-1],pages[0]=pages[0],pages[i-1]
        QuickSort(pages[1:i],ranks)   # not sure about these two: slices are copies, so the recursive
        QuickSort(pages[i+1:len(pages)],ranks)   # sorts might not make it back into the original list
def Look_up_new(index,ranks,keyword):
    pages=Look_up(index,keyword)    # Look_up() is another function I don't actually have defined
    print '\nPrinting the results as is with page rank\n'
    for i in pages:
        print i+" --> "+str(ranks[i])   # displaying the lists, so that you can see the page rank alongside
    QuickSort(pages,ranks)
    print "\nAfter Sorting the results by page rank\n"
    it=0
    for i in pages:
        it+=1
        print str(it)+'.\t'+i+'\n'
#print index
print "Enter What you want to search"
search_term=raw_input()
try:
    print "Enter the depth you wanna go"
    max_limit=int(raw_input())
except:
    f=None
print '\nStarted crawling, presently at depth..'
crawled,index,graph=Crawl_web(url)  # printing all the links
ranks=compute_ranks(graph)  # calculating the page ranks
Look_up_new(index,ranks,search_term)
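For what it's worth, the first half of the script builds index as a list of [word, [urls]] pairs, but Crawl_web creates index={} (a dict), and both get passed through the same addToIndex. If I keep the dict, my guess is addToIndex would need to look something like this (untested, just my attempt at a dict version):
Code:
def addToIndex(index, keyword, url):
    # dict version: maps each keyword to a list of urls
    if keyword in index:
        if url not in index[keyword]:
            index[keyword].append(url)
    else:
        index[keyword] = [url]
Would that be the right approach, or should I just use index=[] inside Crawl_web as well?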