📖Document Search

Requirements

Python environment with necessary packages installed.
GenAI Stack library and its dependencies.
Weaviate, an open-source vector search engine, installed and configured if it is used as the underlying VectorDB.
A dataset or source documents for indexing and searching.

from genai_stack.embedding.langchain import LangchainEmbedding
from genai_stack.etl.langchain import LangchainETL
from genai_stack.stack.stack import Stack
from genai_stack.vectordb import ChromaDB
from genai_stack.vectordb.weaviate_db import Weaviate

Search single document

Search a single document using an ETL component and a vector database.

# Embedding component: a HuggingFace sentence-transformers model run on CPU,
# with embedding normalization switched off.
embedding_fields = {
    "model_name": "sentence-transformers/all-mpnet-base-v2",
    "model_kwargs": {"device": "cpu"},
    "encode_kwargs": {"normalize_embeddings": False},
}
embedding = LangchainEmbedding.from_kwargs(
    name="HuggingFaceEmbeddings",
    fields=embedding_fields,
)

# Vector database created without any explicit options.
chromadb = ChromaDB.from_kwargs()

# ETL component that points PyPDFLoader at a single PDF file;
# replace the placeholder with the path to your own document.
pdf_loader_fields = {"file_path": "<your_file>.pdf"}
etl = LangchainETL.from_kwargs(name="PyPDFLoader", fields=pdf_loader_fields)

# Wire the components together; no model is supplied for document search.
stack = Stack(model=None, embedding=embedding, vectordb=chromadb, etl=etl)

# Query the vector database with a natural-language question.
question = "Who provide technical assistance to computer system users?"
doc = chromadb.similarity_search(question)

# Print the metadata of every returned match.
for match in doc:
    print(match.metadata)

output

{'page': 2, 'source': '/home/akshaj/Documents/AIPlanet/DocumentSearch/data/2A2C2V4WI5YRDJHR26XUD4IAULIYGTMA.pdf'}
{'page': 2, 'source': '/home/akshaj/Documents/AIPlanet/DocumentSearch/data/2A2C2V4WI5YRDJHR26XUD4IAULIYGTMA.pdf'}
{'page': 2, 'source': '/home/akshaj/Documents/AIPlanet/DocumentSearch/data/2A2C2V4WI5YRDJHR26XUD4IAULIYGTMA.pdf'}
{'page': 1, 'source': '/home/akshaj/Documents/AIPlanet/DocumentSearch/data/2A2C2V4WI5YRDJHR26XUD4IAULIYGTMA.pdf'}

Search multiple documents

Search a directory containing documents. Returns a list of documents with path and page number.

# Same embedding configuration as in the single-document example:
# a HuggingFace sentence-transformers model on CPU, normalization off.
embedding_fields = {
    "model_name": "sentence-transformers/all-mpnet-base-v2",
    "model_kwargs": {"device": "cpu"},
    "encode_kwargs": {"normalize_embeddings": False},
}
embedding = LangchainEmbedding.from_kwargs(
    name="HuggingFaceEmbeddings",
    fields=embedding_fields,
)

# Weaviate connection settings; "source" and "page" are the attributes
# requested alongside the text so that results can report them.
weaviate_options = {
    "url": "http://localhost:8080/",
    "index_name": "Testing",
    "text_key": "test",
    "attributes": ["source", "page"],
}
db = Weaviate.from_kwargs(**weaviate_options)

# Directory holding the documents to index; replace the placeholder
# with the path to your own folder.
file_folder = "<your_file_directory>"

import os
# List the entries of the directory (shown as the cell output below).
os.listdir(file_folder)

output

['2A2C2V4WI5YRDJHR26XUD4IAULIYGTMA.pdf',
 '2ED27NR7CISW7J4PHXXBZ6OFPVDFHMFB.pdf',
 '2EDEPZ4VHTLPTWSZR6FAVUJ3B2ZVSIPS.pdf',
 '2F73J4NP2YHKVISKHDIDJ7RGPDKTQZ7D.pdf',
 '2KQDEYIMQDVT2DUARRCJV5HEUYY2HO7H.pdf',
 '2LVOKCURIEQKLK43I6T7QLYYQX3RQUXX.pdf',
 '2QCQAIXCPZZPZPEEBHJT4WUB5BA42DCP.pdf',
 '2QWDF5JK4N7WQ4NQRZRLF4CYOUF32WTR.pdf']

# ETL component that loads every "*.pdf" file found in `file_folder`
# through DirectoryLoader, using PyPDFLoader for the individual files.
directory_loader_fields = {
    "path": file_folder,
    "glob": "*.pdf",
    "loader_cls": "langchain.document_loaders.PyPDFLoader",
    "use_multithreading": True,
    "show_progress": True,
}
etl = LangchainETL.from_kwargs(
    name="DirectoryLoader",
    fields=directory_loader_fields,
)

# Wire the components together, this time with Weaviate as the vector database.
stack = Stack(model=None, embedding=embedding, vectordb=db, etl=etl)

# Query the Weaviate-backed vector database.
question = "Who provide technical assistance to computer system users?"
doc = db.similarity_search(question)

# Summarise each match as a dict holding its text, page number and file path.
[
    {
        "content": match.page_content,
        "page": match.metadata["page"],
        "path": match.metadata["source"],
    }
    for match in doc
]

output

[{'content': 'Revised  January 10, 2014  \n \nJUDICIAL INTERN  HIRING INFORMATION  \nLorna G. Schofield , United States District Judge  \n \nChambers  Contact Information :         \nUnited States District Court      \nSouthern District of New York              \n40 Centre Street, Room 20 1      \nNew York, NY  10007  \n(212) 805 -0288 \n \nPositions :  Judge Schofield hires first - and second -year law students as interns during the school \nyear and for summer employment .  During the school year, interns must be available for a \nsemester at least 20 hours a week.  During the summer, interns must be available to work full \ntime for at least eight weeks.   \nApplications :  Applications should include a resume, transcript and writi ng sample.   First-year \nstudents should not apply until they have received grades from all of their first semester classes.   ',
  'page': 0,
  'path': '/home/akshaj/Documents/AIPlanet/DocumentSearch/data-2/2KQDEYIMQDVT2DUARRCJV5HEUYY2HO7H.pdf'},
 {'content': 'Revised  January 10, 2014  \n \nJUDICIAL INTERN  HIRING INFORMATION  \nLorna G. Schofield , United States District Judge  \n \nChambers  Contact Information :         \nUnited States District Court      \nSouthern District of New York              \n40 Centre Street, Room 20 1      \nNew York, NY  10007  \n(212) 805 -0288 \n \nPositions :  Judge Schofield hires first - and second -year law students as interns during the school \nyear and for summer employment .  During the school year, interns must be available for a \nsemester at least 20 hours a week.  During the summer, interns must be available to work full \ntime for at least eight weeks.   \nApplications :  Applications should include a resume, transcript and writi ng sample.   First-year \nstudents should not apply until they have received grades from all of their first semester classes.   ',
  'page': 0,
  'path': '/home/akshaj/Documents/AIPlanet/DocumentSearch/data-2/2KQDEYIMQDVT2DUARRCJV5HEUYY2HO7H.pdf'},
 {'content': 'Revised  January 10, 2014  \n \nJUDICIAL INTERN  HIRING INFORMATION  \nLorna G. Schofield , United States District Judge  \n \nChambers  Contact Information :         \nUnited States District Court      \nSouthern District of New York              \n40 Centre Street, Room 20 1      \nNew York, NY  10007  \n(212) 805 -0288 \n \nPositions :  Judge Schofield hires first - and second -year law students as interns during the school \nyear and for summer employment .  During the school year, interns must be available for a \nsemester at least 20 hours a week.  During the summer, interns must be available to work full \ntime for at least eight weeks.   \nApplications :  Applications should include a resume, transcript and writi ng sample.   First-year \nstudents should not apply until they have received grades from all of their first semester classes.   ',
  'page': 0,
  'path': '/home/akshaj/Documents/AIPlanet/DocumentSearch/data-2/2KQDEYIMQDVT2DUARRCJV5HEUYY2HO7H.pdf'},
 {'content': 'Revised  January 10, 2014  \n \nJUDICIAL INTERN  HIRING INFORMATION  \nLorna G. Schofield , United States District Judge  \n \nChambers  Contact Information :         \nUnited States District Court      \nSouthern District of New York              \n40 Centre Street, Room 20 1      \nNew York, NY  10007  \n(212) 805 -0288 \n \nPositions :  Judge Schofield hires first - and second -year law students as interns during the school \nyear and for summer employment .  During the school year, interns must be available for a \nsemester at least 20 hours a week.  During the summer, interns must be available to work full \ntime for at least eight weeks.   \nApplications :  Applications should include a resume, transcript and writi ng sample.   First-year \nstudents should not apply until they have received grades from all of their first semester classes.   ',
  'page': 0,
  'path': '/home/akshaj/Documents/AIPlanet/DocumentSearch/data-2/2KQDEYIMQDVT2DUARRCJV5HEUYY2HO7H.pdf'}]

# Search for a phrase and report only where each match was found.
phrase = "Chambers Contact Information:"
doc = db.similarity_search(phrase)

# Print the metadata of every returned match.
for match in doc:
    print(match.metadata)

output

{'page': 0, 'source': '/home/akshaj/Documents/AIPlanet/DocumentSearch/data-2/2KQDEYIMQDVT2DUARRCJV5HEUYY2HO7H.pdf'}
{'page': 0, 'source': '/home/akshaj/Documents/AIPlanet/DocumentSearch/data-2/2KQDEYIMQDVT2DUARRCJV5HEUYY2HO7H.pdf'}
{'page': 0, 'source': '/home/akshaj/Documents/AIPlanet/DocumentSearch/data-2/2KQDEYIMQDVT2DUARRCJV5HEUYY2HO7H.pdf'}
{'page': 0, 'source': '/home/akshaj/Documents/AIPlanet/DocumentSearch/data-2/2KQDEYIMQDVT2DUARRCJV5HEUYY2HO7H.pdf'}

Check out the notebook [doc-search.ipynb](doc-search.ipynb) for more details.

Last updated 1 year ago