In [1]:
# Dependencies
from datetime import date, timedelta  # date handling for fetching recent news
from IPython import display  # for pretty printing
import json  # for parsing the JSON api responses and model outputs
from numpy import dot  # for cosine similarity
import openai  # for using GPT and getting embeddings
import os  # for loading environment variables
import requests  # for making the API requests
from tqdm.notebook import tqdm  # for printing progress bars

# Load environment variables
GPT_MODEL = "gpt-3.5-turbo"


# Helper functions
def json_gpt(input: str):
    completion = openai.ChatCompletion.create(
        model=GPT_MODEL,
        messages=[
            {"role": "system", "content": "Output only valid JSON"},
            {"role": "user", "content": input},
        ],
        temperature=0.5,
    )

    text = completion.choices[0].message.content
    parsed = json.loads(text)

    return parsed


def embeddings(input: list) -> list:
    response = openai.Embedding.create(model="text-embedding-ada-002", input=input)
    return [data.embedding for data in response.data]

## 1. Search

> It all starts with a user question.


In [2]:
# User asks a question
USER_QUESTION = "Why hire Daniel?"

> Now, in order to be as exhaustive as possible, we use the model to generate a list of diverse queries based on this question.


In [3]:
QUERIES_INPUT = f"""
You have access to a search API that returns documents from danielsgriffin.com.
Generate an array of search queries that are relevant to this question.
Use a variation of related keywords for the queries, trying to be as general as possible.
Include as many queries as you can think of, including and excluding terms.
For example, include queries like ['keyword_1 keyword_2', 'keyword_1', 'keyword_2'].
Be creative. The more queries you include, the more likely you are to find relevant results.

User question: {USER_QUESTION}

Format: {{"queries": ["query_1", "query_2", "query_3"]}}
"""

queries = json_gpt(QUERIES_INPUT)["queries"]

# Let's include the original question as well for good measure
queries.append(USER_QUESTION)

queries

['Why hire Daniel?',
 'Reasons to hire Daniel',
 'Benefits of hiring Daniel',
 'Advantages of hiring Daniel',
 'Why should I hire Daniel?',
 'Is Daniel a good hire?',
 'What makes Daniel a great hire?',
 'What sets Daniel apart?',
 'What can Daniel bring to the table?',
 'What skills does Daniel have?',
 "Daniel's qualifications",
 "Daniel's experience",
 "Daniel's expertise",
 "Daniel's track record",
 "Daniel's achievements",
 "Daniel's strengths",
 "Daniel's unique abilities",
 "Daniel's value proposition",
 "Daniel's contributions",
 "Daniel's impact",
 "Daniel's professionalism",
 "Daniel's work ethic",
 "Daniel's dedication",
 "Daniel's reliability",
 "Daniel's problem-solving skills",
 "Daniel's communication skills",
 "Daniel's leadership skills",
 "Daniel's teamwork",
 "Daniel's creativity",
 "Daniel's adaptability",
 "Daniel's passion",
 "Daniel's motivation",
 "Daniel's commitment",
 'Why hire Daniel?']

In [13]:
def search_dsg_lunr(query: str) -> dict:
    response = requests.get(
        "http://localhost:3000/search",
        params={
            "q": query,
        },
    )

    return response.json()


results = []

for query in tqdm(queries):
    result = search_dsg_lunr(query)
    if result["status"] == "ok":
        results = results + result["documents"]
    else:
        raise Exception(result["message"])

# remove duplicates & retrieve docData as documents
documents = list({result["docData"]["url"]: result["docData"] for result in results}.values())
print(documents[0])

print("Total number of documents:", len(documents))
print("Top 5 documents of query 1:", "\n")

for document in documents[0:5]:
    print("Title:", document["title"])
    print("Description:", document["snippet"])
    print("Content:", document["content"][0:100] + "...")
    print()


HBox(children=(FloatProgress(value=0.0, max=34.0), HTML(value='')))


{'id': 'hire-me-html', 'type': 'pages', 'date': '', 'title': 'Hire me.', 'boosts': '', 'tags': '', 'content': 'Many people are now seeing that search is much more than ten blue links, much more than one company. We have a big chance to really change search for the better. Will we?I have a Ph.D.\xa0in Information Science from the School of Information at the University of California, Berkeley. I’m a technically-skilled qualitative researcher focused on web search tools and practices. I use interviews and digital ethnography to research how we talk about, imagine, know, build, and practice different ways of searching. My dissertation looked at how data engineers effectively use general-purpose web search at work.I’m looking at finding or creating opportunities in industry for me to bring my expertise in search and research to contribute to better understanding and improving search tools and practices, amidst changes around generative AI.Generative search and search-like tools are shifti

> As we can see, oftentimes, the search queries will return a large number of results, many of which are not relevant to the original question asked by the user. In order to improve the quality of the final answer, we use embeddings to re-rank and filter the results.

# 2. Re-rank

> Drawing inspiration from [HyDE (Gao et al.)](https://arxiv.org/abs/2212.10496), we first generate a hypothetical ideal answer to rerank our compare our results against. This helps prioritize results that look like good answers, rather than those similar to our question. Here’s the prompt we use to generate our hypothetical answer.


In [5]:
HA_INPUT = f"""
Task: Generate a hypothetical answer to the user's question. This answer will be used to rank search results. 
Pretend you have all the information you need to answer, but don't use any actual facts. Instead, use placeholders
like NAME did something, or NAME said something at PLACE. 

User question: {USER_QUESTION}

Format: {{"hypotheticalAnswer": "hypothetical answer text"}}
"""

hypothetical_answer = json_gpt(HA_INPUT)["hypotheticalAnswer"]

hypothetical_answer


"Daniel has a proven track record of success in his previous roles. He consistently exceeded targets and delivered exceptional results. His colleagues describe him as a highly motivated and dedicated team player. Daniel's strong analytical skills and problem-solving abilities make him an invaluable asset to any organization. Hiring Daniel would bring a fresh perspective and innovative ideas to the team, driving growth and success."

> Now, let's generate embeddings for the search results and the hypothetical answer. We then calculate the cosine distance between these embeddings, giving us a semantic similarity metric. Note that we can simply calculate the dot product in lieu of doing a full cosine similarity calculation since the OpenAI embeddings are returned normalized in our API.

In [6]:
import os
import hashlib
import pickle

# Function to generate a hash for a document
def hash_document(document):
    document_string = f"{document['title']} {document.get('description', '')} {document.get('content', '')[:100]}"
    return hashlib.sha256(document_string.encode()).hexdigest()

# Load existing embeddings and hashes
if os.path.exists('hypothetical_answer_embedding.pkl') and os.path.exists('document_embeddings.pkl') and os.path.exists('document_hashes.pkl'):
    with open('hypothetical_answer_embedding.pkl', 'rb') as f:
        hypothetical_answer_embedding = pickle.load(f)

    with open('document_embeddings.pkl', 'rb') as f:
        document_embeddings = pickle.load(f)

    with open('document_hashes.pkl', 'rb') as f:
        document_hashes = pickle.load(f)
else:
    hypothetical_answer_embedding = embeddings(hypothetical_answer)[0]
    document_embeddings = []
    document_hashes = set()

# Process each document
for document in documents:
    document_hash = hash_document(document)
    if document_hash not in document_hashes:
        document_string = f"{document['title']} {document.get('description', '')} {document.get('content', '')[:100]}"
        document_embedding = embeddings([document_string])[0]
        document_embeddings.append(document_embedding)
        document_hashes.add(document_hash)

# Save the embeddings and hashes to a file
with open('hypothetical_answer_embedding.pkl', 'wb') as f:
    pickle.dump(hypothetical_answer_embedding, f)

with open('document_embeddings.pkl', 'wb') as f:
    pickle.dump(document_embeddings, f)

with open('document_hashes.pkl', 'wb') as f:
    pickle.dump(document_hashes, f)

# Calculate cosine similarity
cosine_similarities = []
for document_embedding in document_embeddings:
    cosine_similarities.append(dot(hypothetical_answer_embedding, document_embedding))

cosine_similarities[0:10]


[0.7873335622875117,
 0.6441566899333357,
 0.7011248224436103,
 0.6610630574919335,
 0.7047552800788842,
 0.6691687613776545,
 0.6979306941021742,
 0.7168695282905165,
 0.7117792356986318,
 0.6756729978316052]

> Finally, we use these similarity scores to sort and filter the results.

In [7]:
scored_documents = zip(documents, cosine_similarities)

# Sort documents by cosine similarity
scored_documents = sorted(scored_documents, key=lambda x: x[1], reverse=True)

# Print top 5 documents
print("Top 5 documents:", "\n")

for document, score in scored_documents[0:5]:
    print("Title:", document["title"])
    print("Snippet:", document["snippet"])
    print("Content:", document["content"][0:100] + "...")
    print("Score:", score)
    print()


Top 5 documents: 

Title: sharing interfaces for generative search responses
Snippet: This document discusses the concept of sharing interfaces for generative search responses. It explores the importance of sharing search results and documents the author's research on sharing and repairing searching. The document also mentions several tools that provide a sharing interface, such as OpenAI's ChatGPT, Phind, You.com's YouChat, and Inflection AI's Pi.
Content: This is a doc about generative search (and search-like) tool support for sharing responses/results t...
Score: 0.7944482132146893

Title: 6. Owning searching
Snippet: This document explores the solitary and secretive nature of web searching among data engineers, who rely heavily on search but often keep their practices hidden from colleagues. The author suggests that the lack of discussion around search and delegation of search tasks to individuals has resulted in poorer learning and marginalization of certain individuals. The docum

In [8]:
formatted_top_results = [
    {
        "title": document["title"],
        "description": document["snippet"],
        "url": document["url"],
    }
    for document, _score in scored_documents[0:10]
]

ANSWER_INPUT = f"""
Generate an answer to the user's question based on the given search results. 
TOP_RESULTS: {formatted_top_results}
USER_QUESTION: {USER_QUESTION}

Include as much information as possible in the answer. Reference the relevant search result urls as markdown links.
"""

completion = openai.ChatCompletion.create(
    model=GPT_MODEL,
    messages=[{"role": "user", "content": ANSWER_INPUT}],
    temperature=0.5,
    stream=True,
)

text = ""
for chunk in completion:
    text += chunk.choices[0].delta.get("content", "")
    display.clear_output(wait=True)
    display.display(display.Markdown(text))

Daniel S. Griffin is a Ph.D. holder in Information Science from the University of California, Berkeley, who specializes in web search tools and practices. He is actively seeking opportunities in industry to contribute to better understanding and improving search tools and practices, particularly in generative search and search-like tools. With his background and expertise, Daniel is well-equipped to understand users and effectively communicate findings. If you are interested in connecting with Daniel or have any potential opportunities, you can find his contact information on his [Hire me](/hire-me.html) page.

Source: [Hire me](/hire-me.html)

In [9]:
text

'Daniel S. Griffin is a Ph.D. holder in Information Science from the University of California, Berkeley, who specializes in web search tools and practices. He is actively seeking opportunities in industry to contribute to better understanding and improving search tools and practices, particularly in generative search and search-like tools. With his background and expertise, Daniel is well-equipped to understand users and effectively communicate findings. If you are interested in connecting with Daniel or have any potential opportunities, you can find his contact information on his [Hire me](/hire-me.html) page.\n\nSource: [Hire me](/hire-me.html)'

In [10]:
# bonus w/ gpt-4

In [11]:
completion = openai.ChatCompletion.create(
    model='gpt-4',
    messages=[{"role": "user", "content": ANSWER_INPUT}],
    temperature=0.5,
    stream=True,
)

text = ""
for chunk in completion:
    text += chunk.choices[0].delta.get("content", "")
    display.clear_output(wait=True)
    display.display(display.Markdown(text))

Daniel S. Griffin is a highly qualified professional with a Ph.D. in Information Science from the University of California, Berkeley. He has extensive experience working with various organizations on topics such as web search, algorithmic fairness, and cybersecurity. His expertise lies in understanding users and identifying and communicating findings effectively, which makes him a valuable asset for any organization ([About](/about.html)).

Daniel is a skilled qualitative researcher who has focused on web search tools and practices. His background and expertise enable him to contribute significantly to the understanding and improvement of search tools and practices, particularly in generative search and search-like tools. He is actively seeking opportunities in the industry to leverage his skills and knowledge ([Hire me.](/hire-me.html)).

Daniel has also demonstrated his knowledge through his research. He has explored the solitary and secretive nature of web searching among data engineers, the importance of search repair practices, and the potential for technocratization of search ([6. Owning searching](/diss/owning_searching.html), [5. Repairing searching](/diss/repairing_searching.html)).

Furthermore, he has taught a course on understanding change in web search at Michigan State University, indicating his ability to share his knowledge with others and his commitment to ongoing learning ([Repairing Searching](/rs.html)).

Hiring Daniel would mean gaining a team member with a deep understanding of search practices, a commitment to improving these practices, and the ability to effectively communicate his findings.

In [12]:
text

'Daniel S. Griffin is a highly qualified professional with a Ph.D. in Information Science from the University of California, Berkeley. He has extensive experience working with various organizations on topics such as web search, algorithmic fairness, and cybersecurity. His expertise lies in understanding users and identifying and communicating findings effectively, which makes him a valuable asset for any organization ([About](/about.html)).\n\nDaniel is a skilled qualitative researcher who has focused on web search tools and practices. His background and expertise enable him to contribute significantly to the understanding and improvement of search tools and practices, particularly in generative search and search-like tools. He is actively seeking opportunities in the industry to leverage his skills and knowledge ([Hire me.](/hire-me.html)).\n\nDaniel has also demonstrated his knowledge through his research. He has explored the solitary and secretive nature of web searching among data 