Help with python search engine!!

Ok, I have been trying to create a simple search engine with Python and have got a few errors which I'm not sure how to fix.

This program lets the user enter a URL; it then lists the URLs linked from that page and pulls back all the keywords on it. After that it asks the user to search for a specific keyword, and that's when the error occurs.

I have used some of my own code and some I got off the web, and I can't seem to get it to work!
Basically the error is the following: 'dict' object has no attribute 'append'.
Is it because I'm using lists and not a dictionary? How would I fix it?
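To show what I mean, here is a quick sketch of where the error comes from (just an illustration; the keyword and URL are made up, it's not my actual code):
Code:
# Sketch only: the keyword and URL below are placeholders.
# A list has an append method, so this works:
index = []
index.append(["keyword", ["http://example.com"]])

# A plain dict has no append method, so the same call fails:
index = {}
try:
        index.append(["keyword", ["http://example.com"]])
except AttributeError as e:
        print e   # 'dict' object has no attribute 'append'
The full program is below: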

Code:
import urllib2
max_limit=5
def get_page(url):
        try:
                f = urllib.urlopen(url)
                page = f.read()
                f.close()
                return page
        except:        
                return ""
        return ""
def getAllNewLinksOnPage(page,prevLinks):

        response = urllib2.urlopen(page)
        html = response.read()

        links,pos,allFound=[],0,False
        while not allFound:
                aTag=html.find("<a href=",pos)
                if aTag>-1:
                        href=html.find('"',aTag+1)
                        endHref=html.find('"',href+1)
                        url=html[href+1:endHref]
                        if url[:7]=="http://":
                                if url[-1]=="/":
                                        url=url[:-1]
                                if not url in links and not url in prevLinks:
                                        links.append(url)     
                                        print url
                        closeTag=html.find("</a>",aTag)
                        pos=closeTag+1
                else:
                        allFound=True   
        return links

def getLinks(url):
        toCrawl=[url]
        crawled=[]
        while toCrawl:
                url=toCrawl.pop()
                crawled.append(url)
                newLinks=getAllNewLinksOnPage(url,crawled)
                toCrawl=list(set(toCrawl)|set(newLinks))
                
        return crawled

url=raw_input("Enter a URL to Search:")
linksCrawled = getLinks(url)


print("----------------------")


def addToIndex(index,keyword,url):
        for entry in index:
                if entry[0]==word:
                        if not url in entry[1]:
                                entry[1].append(url)
                        return
        index.append([word,[url]])
      
index=[]
response = urllib2.urlopen(url)
html = response.read()
pageText=""
html=html[html.find("<body")+5:html.find("</body>")]

finished=False
while not finished:
        nextCloseTag=html.find(">")
        nextOpenTag=html.find("<")
        if nextOpenTag>-1:
                content=" ".join(html[nextCloseTag+1:nextOpenTag].strip().split())
                pageText=pageText+" "+content
                html=html[nextOpenTag+1:]
        else:
                finished=True
                
        for word in pageText.split():
                if word[0].isalnum() and len(word)>4:
                        addToIndex(index,word,url)
                        
print "{} unique words found".format(len(index))

print index

print("----------------------")

def compute_ranks(graph):
        d=0.8
        numloops=10
        ranks={}
        npages=len(graph)
        for page in graph:
                ranks[page]=1.0/npages
        for i in range(0,numloops):
                newranks={}
                for page in graph:
                        newrank=(1-d)/npages
                        for node in graph:
                                if page in graph[node]:
                                        newrank=newrank+d*ranks[node]/len(graph[node])
                        newranks[page]=newrank
                ranks=newranks
        return ranks
        
def Crawl_web(seed):
        tocrawl=[seed]
        crawled=[]
        index={}
        graph={}
        global max_limit
        while tocrawl:
                p=tocrawl.pop()
                if p not in crawled:
                        max_limit-=1
                        print max_limit
                        if max_limit<=0:
                                break
                        c=get_page(p)
                        addToIndex(index,p,c)
                        f=get_all_links(c)
                        union(tocrawl,f)
                        graph[p]=f
                        crawled.append(p)
        return crawled,index,graph 

def QuickSort(pages,ranks):
        if len(pages)>1:
                piv=ranks[pages[0]]
                i=1
                j=1
                for j in range(1,len(pages)):
                        if ranks[pages[j]]>piv:
                                pages[i],pages[j]=pages[j],pages[i]
                                i+=1
                pages[i-1],pages[0]=pages[0],pages[i-1]
                QuickSort(pages[1:i],ranks)
                QuickSort(pages[i+1:len(pages)],ranks)


def Look_up_new(index,ranks,keyword):
        pages=Look_up(index,keyword)
        print '\nPrinting the results as is with page rank\n'
        for i in pages:
                print i+" --> "+str(ranks[i])#Displaying the lists, so that you can see the page rank along side
        QuickSort(pages,ranks)
        print "\nAfter Sorting the results by page rank\n"
        it=0
        for i in pages:
                it+=1
                print str(it)+'.\t'+i+'\n' 




#print index
print "Enter What you want to search"
search_term=raw_input()
try:
        print "Enter the depth you wanna go"
        max_limit=int(raw_input())
except:
        f=None
print '\nStarted crawling, presently at depth..'
crawled,index,graph=Crawl_web(url)#printing all the links


ranks=compute_ranks(graph)#Calculating the page ranks
Look_up_new(index,ranks,search_term)

I'm quite new to Python, so apologies :)
P.S. I have been using it on one of my own small closed websites, not out on the open web.

Cheers
 
The error 'dict' object has no attribute 'append' means you are using a dict where a list (actually anything with the method 'append') is expected. If you analyse the call stack and check your code, you should find where the dict comes from (probably Crawl_web passing index into addToIndex).

Is it just a case of changing index={} to index=[]?

Did that, but there is still a pile of errors. Suppose that's what happens when copying someone else's code into your own :(
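For reference, I think a fully dict-based index (which is what the reply seems to be pointing at) would look roughly like this. It's only a sketch I put together from examples; the function names and the URL are just placeholders and it's not tested against my site:
Code:
# Sketch only: dict-based index mapping keyword -> list of urls.
# Function names and the example URL are placeholders.
def add_to_index(index,url,keyword):
        if keyword in index:
                if url not in index[keyword]:
                        index[keyword].append(url)
                return
        index[keyword]=[url]

def look_up(index,keyword):
        if keyword in index:
                return index[keyword]
        return []

index={}
add_to_index(index,"http://example.com","python")
print look_up(index,"python")   # prints ['http://example.com']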
 
Edit: I can get through to selecting what depth to search, but then the problem is that the functions Look_up and get_all_links don't exist, so it breaks. Did you copy in all the required code? It appears you are missing some.

Sorry, here is all the code now. I've got it working, but some of the code I copied in duplicates other code, because I couldn't get certain parts working with my own version.
Code:
import urllib
import urllib2
max_limit=10
def getAllNewLinksOnPage(page,prevLinks):

        response = urllib2.urlopen(page)
        html = response.read()

        links,pos,allFound=[],0,False
        while not allFound:
                aTag=html.find("<a href=",pos)
                if aTag>-1:
                        href=html.find('"',aTag+1)
                        endHref=html.find('"',href+1)
                        url=html[href+1:endHref]
                        if url[:7]=="http://":
                                if url[-1]=="/":
                                        url=url[:-1]
                                if not url in links and not url in prevLinks:
                                        links.append(url)     
                                        print url
                        closeTag=html.find("</a>",aTag)
                        pos=closeTag+1
                else:
                        allFound=True   
        return links

def getLinks(url):
        toCrawl=[url]
        crawled=[]
        while toCrawl:
                url=toCrawl.pop()
                crawled.append(url)
                newLinks=getAllNewLinksOnPage(url,crawled)
                toCrawl=list(set(toCrawl)|set(newLinks))
                
        return crawled

url=raw_input("Enter a URL to Search:")
linksCrawled = getLinks(url)


print("----------------------")


def addToIndex(index,keyword,url):#List-based index: each entry is [keyword,[list of urls]]
        for entry in index:
                if entry[0]==keyword:
                        if not url in entry[1]:
                                entry[1].append(url)
                        return
        index.append([keyword,[url]])
        
      
index=[]
response = urllib2.urlopen(url)
html = response.read()
pageText=""
html=html[html.find("<body")+5:html.find("</body>")]

finished=False
while not finished:
        nextCloseTag=html.find(">")
        nextOpenTag=html.find("<")
        if nextOpenTag>-1:
                content=" ".join(html[nextCloseTag+1:nextOpenTag].strip().split())
                pageText=pageText+" "+content
                html=html[nextOpenTag+1:]
        else:
                finished=True
                
# Index the words once the whole page text has been extracted
for word in pageText.split():
        if word[0].isalnum() and len(word)>4:
                addToIndex(index,word,url)
                        
print "{} unique words found".format(len(index))

#print index
newdict=dict(index)
print newdict

print("----------------------")
print("----------------------")
def get_page(url):#This function is just to return the webpage contents; the source of the webpage when a url is given.
        try:
                f = urllib.urlopen(url)
                page = f.read()
                f.close()
                #print page
                return page
        except:        
                return ""
        return ""
def union(a,b):#The union function merges the second list into first, with out duplicating an element of a, if it's already in a. Similar to set union operator. This function does not change b. If a=[1,2,3] b=[2,3,4]. After union(a,b) makes a=[1,2,3,4] and b=[2,3,4]
        for e in b:
                if e not in a:
                        a.append(e)


def get_next_url(page):
        start_link=page.find("a href")
        if(start_link==-1):
                return None,0
        start_quote=page.find('"',start_link)
        end_quote=page.find('"',start_quote+1)
        url=page[start_quote+1:end_quote]
        return url,end_quote
def get_all_links(page):
        links=[]
        while(True):
                url,n=get_next_url(page)
                page=page[n:]
                if url:
                        links.append(url)
                else:
                        break
        return links
def Look_up(index,keyword):#This function is for given an index, it finds the keyword in the index and returns the list of links
        #f=[]
        if keyword in index:
                return index[keyword]
        return []
#The format of element in the index is <keyword>,[<List of urls that contain the keyword>]
def add_to_index(index,url,keyword):


        if keyword in index:
                if url not in index[keyword]:
                        index[keyword].append(url)
                return
        index[keyword]=[url]
def add_page_to_index(index,url,content):#Adding the content of the webpage to the index
        for i in content.split():
                add_to_index(index,url,i)


def compute_ranks(graph):#Computing ranks for a given graph -> for all the links in it
        d=0.8
        numloops=10
        ranks={}
        npages=len(graph)
        for page in graph:
                ranks[page]=1.0/npages
        for i in range(0,numloops):
                newranks={}
                for page in graph:
                        newrank=(1-d)/npages
                        for node in graph:
                                if page in graph[node]:
                                        newrank=newrank+d*ranks[node]/len(graph[node])
                        newranks[page]=newrank
                ranks=newranks
        return ranks
        
def Crawl_web(seed):#The website to act as seed page is given as input
        tocrawl=[seed]
        crawled=[]
        index={}
        graph={}#new graph
        global max_limit
        while tocrawl:
                p=tocrawl.pop()
                if p not in crawled:
                        max_limit-=1
                        print max_limit
                        if max_limit<=0:
                                break
                        c=get_page(p)
                        add_page_to_index(index,p,c)
                        f=get_all_links(c)
                        union(tocrawl,f)
                        graph[p]=f
                        crawled.append(p) 
        return crawled,index,graph #Returns the list of links


#print index        

def QuickSort(pages,ranks,lo=0,hi=None):#Sorting pages[lo:hi] in place, in descending order of rank
        if hi is None:
                hi=len(pages)
        if hi-lo>1:
                piv=ranks[pages[lo]]
                i=lo+1
                for j in range(lo+1,hi):
                        if ranks[pages[j]]>piv:
                                pages[i],pages[j]=pages[j],pages[i]
                                i+=1
                pages[i-1],pages[lo]=pages[lo],pages[i-1]#move the pivot between the two partitions
                QuickSort(pages,ranks,lo,i-1)#pages with rank above the pivot
                QuickSort(pages,ranks,i,hi)#pages with rank at or below the pivot


def Look_up_new(index,ranks,keyword):
        pages=Look_up(index,keyword)
        print '\nPrinting the results as is with page rank\n'
        for i in pages:
                print i+" --> "+str(ranks[i])
        QuickSort(pages,ranks)
        print "\nAfter Sorting the results by page rank\n"
        it=0
        for i in pages: 
                it+=1
                print str(it)+'.\t'+i+'\n' 


print "Enter What you want to search:"
search_term=raw_input()
try:
        print "Enter the depth you want to Search(Max=10):"
        max_limit=int(raw_input())
except:
        f=None
#print '\nStarted crawling, presently at depth..'
crawled,index,graph=Crawl_web(url)#printing all the links


ranks=compute_ranks(graph)#Calculating the page ranks
Look_up_new(index,ranks,search_term)

I'm sure it could be structured a lot better, but at least it's now working!

Thanks
 