Skip to content

Adding a Search Engine to the repo #2196

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 20 commits into from
May 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions Search_Engine/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
Python program that searches a collection of documents and returns the documents containing the search term. The algorithm uses a reverse (inverted) index that maps each word to the ids of the documents in which it appears, where every document is identified by an integer id. To find the documents matching a search term, we take the intersection of the id lists of all the words in the search term and use the resulting ids to retrieve the document(s) that contain all of these words.

To use directly, run

```python3 backend.py```

To use the GUI, run

```python3 frontend.py```
135 changes: 135 additions & 0 deletions Search_Engine/backend.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
import sqlite3
import test_data
import ast
import json

class SearchEngine:
    """
    It works by building a reverse index store that maps
    words to the ids of the documents that contain them.
    To find the document(s) that contain a certain search
    term, we then take an intersection of the ids of every
    word in the search term
    """

    def __init__(self, db_path="searchengine.sqlite3"):
        """
        Returns - None
        Input - str (optional): path of the SQLite database file,
                defaults to "searchengine.sqlite3"; ":memory:" gives
                a throwaway in-memory database
        ----------
        - Initialize database. we use sqlite3
        - Check if the tables exist, if not create them
        - make sure the single row holding the reverse index exists
        - maintain a class level access to the database
          connection object
        """
        # isolation_level=None puts the connection in SQLite autocommit mode.
        # It behaves like `autocommit=True` but also works on Python versions
        # older than 3.12, where that keyword does not exist.
        self.conn = sqlite3.connect(db_path, isolation_level=None)
        self.conn.execute(
            "CREATE TABLE IF NOT EXISTS IdToDoc(id INTEGER PRIMARY KEY, document TEXT)"
        )
        self.conn.execute(
            "CREATE TABLE IF NOT EXISTS WordToId (name TEXT, value TEXT)"
        )
        # The whole reverse index lives as one JSON object in the row named
        # 'index'; seed it with an empty object the first time round.
        index_row = self.conn.execute(
            "SELECT 1 FROM WordToId WHERE name='index'"
        ).fetchone()
        if index_row is None:
            self.conn.execute("INSERT INTO WordToId VALUES (?, ?)", ("index", "{}"))

    def close(self):
        """
        Returns - None
        Input - None
        ----------
        - close the underlying database connection
        """
        self.conn.close()

    def __enter__(self):
        """
        Returns - SearchEngine: the engine itself, for use in a `with` block
        Input - None
        """
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """
        Returns - None (exceptions raised inside the `with` block propagate)
        Input - the standard context manager exception triple
        ----------
        - close the database connection when the `with` block ends
        """
        self.close()

    def _load_reverse_index(self):
        """
        Returns - dict: maps each indexed word to the list of document ids
                  that contain it
        Input - None
        ----------
        - read the JSON text stored in the 'index' row of WordToId
        - decode and return it
        """
        row = self.conn.execute(
            "SELECT value FROM WordToId WHERE name='index'"
        ).fetchone()
        return json.loads(row[0])

    def index_document(self, document):
        """
        Returns - string: "index successful"
        Input - str: a string of words called document
        ----------
        Indexes the document. It does this by performing two
        operations - add the document to the IdToDoc, then
        adds the words in the document to WordToId
        - takes in the document (str)
        - passes the document to a method to add the document
          to IdToDoc
        - retrieves the id of the inserted document
        - adds that id to the reverse index entry of every
          whitespace-separated word of the document, unless the
          id is already recorded for that word
        - writes the updated reverse index back to WordToId
        """
        row_id = self._add_to_IdToDoc(document)
        reverse_idx = self._load_reverse_index()
        # dict.fromkeys drops repeated words while keeping first-seen order,
        # so each distinct word is handled once per document.
        for word in dict.fromkeys(document.split()):
            doc_ids = reverse_idx.setdefault(word, [])
            if row_id not in doc_ids:
                doc_ids.append(row_id)
        self.conn.execute(
            "UPDATE WordToId SET value = (?) WHERE name='index'",
            (json.dumps(reverse_idx),),
        )
        return "index successful"

    def _add_to_IdToDoc(self, document):
        """
        Returns - int: the id of the inserted document
        Input - str: a string of words called `document`
        ---------
        - use the class-level connection object to insert the document
          into the db
        - retrieve and return the row id of the inserted document
        """
        cur = self.conn.execute(
            "INSERT INTO IdToDoc (document) VALUES (?)", (document,)
        )
        return cur.lastrowid

    def find_documents(self, search_term):
        """
        Returns - list: the rows returned by _find_documents_with_idx for
                  the documents containing every word of the search term;
                  an empty list when nothing matches
        Input - str: a string of words called `search_term`
        ---------
        - split the search term on whitespace, the same way documents
          are split when they are indexed
        - retrieve the reverse index
        - intersect the id lists of all the words; a word that was never
          indexed means no document can match, so the result is empty
        - use the common ids to call the _find_documents_with_idx method
        - return the result of the called method
        """
        terms = search_term.split()
        if not terms:  # blank query
            return []

        reverse_idx = self._load_reverse_index()
        common_idx_of_docs = None
        for term in terms:
            doc_ids = reverse_idx.get(term)
            if not doc_ids:
                # An unknown word makes the intersection empty; skipping it
                # would wrongly return documents that lack the word.
                return []
            if common_idx_of_docs is None:
                common_idx_of_docs = set(doc_ids)
            else:
                common_idx_of_docs.intersection_update(doc_ids)
            if not common_idx_of_docs:  # no document has all words so far
                return []

        return self._find_documents_with_idx(common_idx_of_docs)

    def _find_documents_with_idx(self, idxs):
        """
        Returns - list[tuple]: one single-element row `(document,)` per
                  matching document, ordered by document id
        Input - iterable of idxs
        ---------
        - use the class-level connection object to retrieve the documents that
          have the idx in the input idxs.
        - retrieve and return these rows as a list
        """
        idxs = list(idxs)
        if not idxs:
            return []
        # Only the '?' placeholders are formatted into the statement; the ids
        # themselves are always bound as parameters.
        sql = "SELECT document FROM IdToDoc WHERE id IN ({seq}) ORDER BY id".format(
            seq=",".join("?" * len(idxs))
        )
        return self.conn.execute(sql, idxs).fetchall()


if __name__ == "__main__":
    # Small demo: index four sample sentences, then look up one word.
    engine = SearchEngine()
    engine.index_document("we should all strive to be happy and happy again")

    # Only the status of the second insertion is echoed.
    status = engine.index_document("happiness is all you need")
    print(status)

    for sentence in (
        "no way should we be sad",
        "a cheerful heart is a happy one even in Nigeria",
    ):
        engine.index_document(sentence)

    matches = engine.find_documents("happy")
    print(matches)
37 changes: 37 additions & 0 deletions Search_Engine/frontend.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
from tkinter import *
from tkinter import messagebox
import backend


def add_document():
    """
    Button callback: index the text currently typed in the
    "Add Document" entry and print the status string returned by
    the search engine.
    """
    document = add_documents_entry.get()
    se = backend.SearchEngine()
    try:
        print(se.index_document(document))
    finally:
        # A new engine (and database connection) is created on every click,
        # so release the connection instead of leaving it open.
        se.conn.close()

def find_term():
    """
    Button callback: search for the text currently typed in the
    "Input term to search" entry and print the rows returned by
    the search engine.
    """
    term = find_term_entry.get()
    se = backend.SearchEngine()
    try:
        print(se.find_documents(term))
    finally:
        # A new engine (and database connection) is created on every click,
        # so release the connection instead of leaving it open.
        se.conn.close()

if __name__ == "__main__":
    # Build the main window. The entry widgets are created at module level
    # on purpose: the button callbacks read them as globals.
    root = Tk()
    # The title previously read "Registration Form", left over from another
    # program; this window is the search engine front end.
    root.title("Search Engine")
    root.geometry('300x300')

    # --- "add document" section -------------------------------------------
    add_documents_label = Label(root, text="Add Document:")
    add_documents_label.pack()
    add_documents_entry = Entry(root)
    add_documents_entry.pack()

    add_document_button = Button(root, text="add", command=add_document)
    add_document_button.pack()

    # --- "search" section -------------------------------------------------
    find_term_label = Label(root, text="Input term to search:")
    find_term_label.pack()
    find_term_entry = Entry(root)
    find_term_entry.pack()

    search_term_button = Button(root, text="search", command=find_term)
    search_term_button.pack()

    # Hand control to Tk; returns when the window is closed.
    root.mainloop()
8 changes: 8 additions & 0 deletions Search_Engine/test_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Sample corpus: short sentences that can be used as documents to index.
documents = [
"we should all strive to be happy",
"happiness is all you need",
"a cheerful heart is a happy one",
"no way should we be sad"
]

# Sample query; as a whole word it occurs in the first and third documents.
search = "happy"