This file explains how the logprobs and embeddings are calculated.
Let's assume we have outline points and sentences for rendering.
outline_sents = [
    "Introduction.",
    "Goal-oriented.",
    "Problem solver.",
    "Leader.",
    "Values relationships."
]
text_sents = [
    "Let me introduce you to Jack.",
    "He's a driven and ambitious individual with a laser-focused mindset on achieving his goals.",
    "With a keen eye for detail, he excels in problem-solving and is always seeking new challenges to test his abilities.",
    "Jack is a natural leader, with the ability to inspire and motivate others to perform at their best.",
    "Despite his demanding schedule, he always makes time for his family and friends, valuing the importance of maintaining strong relationships."
]
FUNCTIONS
-- EMBEDDINGS --
from joblib import Memory
from openai.embeddings_utils import get_embedding, distances_from_embeddings  # assumed source of these helpers (older openai Python SDK)

memory = Memory("./joblib_cache", verbose=0)

@memory.cache
def get_distances_from_query_list(query_list, texts):
    # Embed every text sentence once, then compute the distances
    # from each query embedding to all of them.
    list_emb = [get_embedding(text) for text in texts]
    distances = []
    for query in query_list:
        query_emb = get_embedding(query)
        distances.append(distances_from_embeddings(query_emb, list_emb))
    return distances
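Here distances_from_embeddings defaults to cosine distance. In case that helper is unavailable, a minimal sketch of the same computation (names here are my own, not from the SDK):

import numpy as np

def cosine_distances_sketch(query_emb, embeddings):
    # 1 - cosine similarity between the query and each stored embedding,
    # mirroring distances_from_embeddings(query_emb, embeddings) with the
    # default "cosine" metric.
    q = np.asarray(query_emb)
    distances = []
    for emb in embeddings:
        e = np.asarray(emb)
        distances.append(1.0 - float(q @ e) / float(np.linalg.norm(q) * np.linalg.norm(e)))
    return distances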
Getting embeddings is fairly simple.
outline_raw = '''
Introduction.
Goal-oriented.
Problem solver.
Leader.
Values relationships.
'''
text_raw = '''
Let me introduce you to Jack.
He's a driven and ambitious individual with a laser-focused mindset on achieving his goals.
With a keen eye for detail, he excels in problem-solving and is always seeking new challenges to test his abilities.
Jack is a natural leader, with the ability to inspire and motivate others to perform at their best.
Despite his demanding schedule, he always makes time for his family and friends, valuing the importance of maintaining strong relationships.
'''
import spacy

nlp = spacy.load("en_core_web_sm")  # assumed spaCy model; any pipeline with sentence segmentation works

# Splits each raw text into sentences (not individual words)
outline_doc = nlp(outline_raw)
outline_sections = [sentence.text.strip() for sentence in outline_doc.sents]
essay_doc = nlp(text_raw)
essay_sections = [sentence.text.strip() for sentence in essay_doc.sents]
data = get_distances_from_query_list(outline_sections, essay_sections)
data
[[0.2264650657138757, 0.36996475160295084, 0.3575646639671879, 0.35246010007407635, 0.4129912837999572], [0.27857953635370014, 0.21381096218919704, 0.2724693546505097, 0.28576855853523286, 0.356055233516404], [0.2639245561993384, 0.2760368252369235, 0.23257948277585383, 0.29355789145347466, 0.3503152156442455], [0.27826405756339845, 0.31222124374750504, 0.3298975021518694, 0.2736006624945082, 0.3781515680793055], [0.31181189882958193, 0.33325379115170894, 0.3197764966240627, 0.32030896186849145, 0.2730285304568648]]
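Each row corresponds to an outline point and each column to an essay sentence; smaller means closer. The smallest value in every row lies on the diagonal, so each outline point matches its corresponding sentence. A quick check (numpy assumed):

import numpy as np

np.argmin(np.array(data), axis=1)
# array([0, 1, 2, 3, 4]): outline point i is closest to sentence i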
-- LOGPROBS --
The following helper function takes a list and an element of interest, and returns a list of copies of the list with the element of interest placed in each possible position.
def all_unique_pos(lst, element):
    res = []
    for i in range(len(lst)):
        new_list = lst[:]
        new_list.pop(lst.index(element))  # remove the element from the copy
        res.append(new_list[:i] + [element] + new_list[i:])  # reinsert it at position i
    return res
all_unique_pos([1,2,3], 2)
[[2, 1, 3], [1, 2, 3], [1, 3, 2]]
The following helper function takes the list of outline sections and returns a single string containing the question and the outline. Format: the question on the first line, each outline point on its own line prefixed with a bullet, and a newline at the end of the prompt.
def create_prompt(outline_sections):
    # The sections are already individual sentences, so bullet them directly.
    # (The original joined them with '\n' and re-split on '. ', which never
    # matched; the result was a single unbulleted block ending in a doubled
    # period, as the original sample output showed.)
    outline = 'Write a short essay given this outline:\n'
    for i, section in enumerate(outline_sections):
        if section:
            outline += f'• {section.strip()}'
        if i < len(outline_sections) - 1:
            outline += '\n'
    outline += '\n'  # newline separating the outline from the text
    return outline
test_prompt = create_prompt(outline_sents)
The following function concatenates the prompt with the text to create the full string that will be sent as a request. Returns a single string. Nothing extra is inserted between the two parts, so len(question_w_outline) is exactly the character offset where the text begins; compute_log_probs relies on this below.
def full_request_template(question_w_outline, sents):
    return question_w_outline + ' '.join(sents)
test_request_query = full_request_template(test_prompt, text_sents)
test_request_query
"Write a short essay given this outline:\n• Introduction.\nGoal-oriented.\nProblem solver.\nLeader.\nValues relationships..\nLet me introduce you to Jack. He's a driven and ambitious individual with a laser-focused mindset on achieving his goals. With a keen eye for detail, he excels in problem-solving and is always seeking new challenges to test his abilities. Jack is a natural leader, with the ability to inspire and motivate others to perform at their best. Despite his demanding schedule, he always makes time for his family and friends, valuing the importance of maintaining strong relationships."
The following function makes a cached request to OpenAI to get logprobs. Setting echo=True together with logprobs makes the API return logprobs for the prompt tokens themselves, which is what lets us score the text we supplied rather than a new completion.
import openai

@memory.cache
def get_response(full_request):
    response = openai.Completion.create(
        model="text-davinci-003",
        prompt=full_request,
        temperature=0.7,
        max_tokens=256,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        logprobs=10,
        echo=True
    )
    return response
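For reference, everything compute_log_probs needs lives on response.choices[0].logprobs; a quick inspection (hypothetical call, assuming the request built above):

resp = get_response(test_request_query)
lp = resp.choices[0].logprobs
lp.tokens[:3]          # first tokens of the echoed request
lp.text_offset[:3]     # character offset at which each token starts
lp.token_logprobs[:3]  # per-token logprobs; the very first entry is None under echo=True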
Given a list of orderings in which ONE SENTENCE OF INTEREST appears in every possible position, get a response from OpenAI for each ordering.
def get_all_responses(sents, question_w_outline):
    response = []
    for x in sents:
        response.append(get_response(full_request_template(question_w_outline, x)))
    return response
This function gets logprobs for every ordering produced by all_unique_pos: for each sentence, it scores the text with that sentence moved to every possible position (n × n requests for n sentences, so 25 here, which is why the responses are cached). Takes the text sentences and the prompt as input, and returns a nested list of responses.
def all_the_log_probs(sentences, question_w_outline):
    res = []
    for x in sentences:
        res.append(get_all_responses(all_unique_pos(sentences, x), question_w_outline))
    return res
The following helper function calculates the sum of logprobs for the text part only. Because the request also contains the question (with its spaces and newlines), it uses the text offsets in the response to locate where the text begins and ends, and sums the token logprobs over that span. Returns a single number: the total logprob of the text part.
def compute_log_probs(question_w_outline, original_text, response):
    logprobs = response.choices[0].logprobs
    # The text starts exactly where the prompt ends (see full_request_template).
    start_point = len(question_w_outline)
    start_index = logprobs.text_offset.index(start_point)
    # Index of the token whose offset is closest to the text's last character.
    end_point = start_point + len(original_text) - 1
    end_index = min(range(len(logprobs.text_offset)),
                    key=lambda i: abs(logprobs.text_offset[i] - end_point))
    total = 0
    for x in range(start_index, end_index + 1):
        total += logprobs.token_logprobs[x]
    return total
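As a sanity check (hypothetical call, reusing the cached response from above), scoring the text in its original order should reproduce the diagonal value in the final table below:

compute_log_probs(test_prompt, ' '.join(text_sents), get_response(test_request_query))
# ≈ -146.409, the diagonal entry of the final result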
Final function that calculates all logprobs for all the orderings of the sentences.
def allLogProbs(res, sentences, question_w_outline):
    all_logprobs = []
    for i in range(len(sentences)):
        combinations = all_unique_pos(sentences, sentences[i])
        logprobs = []
        for j in range(len(combinations)):
            original_text = ' '.join(combinations[j])
            logprobs.append(compute_log_probs(question_w_outline, original_text, res[i][j]))
        # Get the highest indices here
        # highest_indices = getHighestIndexes(logprobs, 3)
        all_logprobs.append(logprobs)
    return all_logprobs
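getHighestIndexes is referenced above but not defined in this file; a minimal sketch of what it presumably does (indices of the k largest logprobs, best first):

def getHighestIndexes(values, k):
    # Sort indices by their value, largest first, and keep the top k.
    return sorted(range(len(values)), key=lambda i: values[i], reverse=True)[:k]

getHighestIndexes([-146.4, -174.6, -160.5], 2)
# [0, 2]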
For the logprobs, we run the following to get one big result that the front-end can use to render the results.
question_w_outline = create_prompt(outline_sents)
res = all_the_log_probs(text_sents, question_w_outline)
final = allLogProbs(res, text_sents, question_w_outline)
final
[[-146.409018288316, -174.6094012188949, -177.54884178347095, -184.923083856484, -171.44304057892901], [-174.6094012188949, -146.409018288316, -160.52425362044303, -164.24308980543395, -162.279895625332], [-176.6957944925011, -160.52425362044303, -146.409018288316, -157.42897514411194, -160.48215467138195], [-173.57262323202693, -159.56882654866195, -157.42897514411194, -146.409018288316, -153.003944198414], [-179.64452757485998, -166.5090151766819, -155.39103498734303, -153.003944198414, -146.409018288316]]
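Row i holds the logprobs for moving sentence i through each position. In every row the largest (least negative) value, about -146.41, sits on the diagonal, meaning the original sentence order scores highest. A quick check (numpy assumed):

import numpy as np

np.argmax(np.array(final), axis=1)
# array([0, 1, 2, 3, 4]): each sentence scores best in its original position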