-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path22.ai-dml.sql
More file actions
371 lines (331 loc) · 14.5 KB
/
22.ai-dml.sql
File metadata and controls
371 lines (331 loc) · 14.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
-- =====================================================
-- Basic Semantic Search with Cosine Distance
-- Demonstrates: Finding the closest movies to a search phrase embedding
-- NOTE: <=> is cosine distance operator
-- =====================================================
WITH query_embedding AS (SELECT omdb.get_embedding('May the force be with you') AS emb)
SELECT id,
name,
description,
movie_embedding <=> query_embedding.emb AS cosine_distance
FROM omdb.movies,
query_embedding
WHERE movie_embedding IS NOT NULL
ORDER BY movie_embedding <=> query_embedding.emb
LIMIT 10;
-- =====================================================
-- Semantic Search Returning Similarity Score
-- Demonstrates: Converting cosine distance into a more readable similarity metric
-- =====================================================
SELECT id,
name,
ROUND((1 - (movie_embedding <=> omdb.get_embedding('May the force be with you')))::numeric, 4) AS similarity_score
FROM omdb.movies
WHERE movie_embedding IS NOT NULL
ORDER BY movie_embedding <=> omdb.get_embedding('May the force be with you')
LIMIT 10;
-- =====================================================
-- Semantic Search with Release Date Filter
-- Demonstrates: Combining vector similarity with structured filtering
-- =====================================================
SELECT id,
name,
release_date,
vote_average,
movie_embedding <=> omdb.get_embedding('May the force be with you') AS cosine_distance
FROM omdb.movies
WHERE movie_embedding IS NOT NULL
AND release_date >= DATE '2010-01-01'
ORDER BY movie_embedding <=> omdb.get_embedding('May the force be with you')
LIMIT 10;
-- =====================================================
-- Semantic Search with Rating Threshold
-- Demonstrates: Retrieving semantically relevant but also highly rated movies
-- =====================================================
SELECT id,
name,
vote_average,
votes_count,
movie_embedding <=> omdb.get_embedding('May the force be with you') AS cosine_distance
FROM omdb.movies
WHERE movie_embedding IS NOT NULL
AND vote_average >= 7.5
AND votes_count >= 1000
ORDER BY movie_embedding <=> omdb.get_embedding('May the force be with you')
LIMIT 10;
-- =====================================================
-- Semantic Search with Revenue Filter
-- Demonstrates: Mixing AI search with business-oriented structured criteria
-- =====================================================
SELECT id,
name,
revenue,
budget,
movie_embedding <=> omdb.get_embedding('May the force be with you') AS cosine_distance
FROM omdb.movies
WHERE movie_embedding IS NOT NULL
AND revenue > 100000000
ORDER BY movie_embedding <=> omdb.get_embedding('May the force be with you')
LIMIT 10;
-- =====================================================
-- Top-K Nearest Neighbor Search
-- Demonstrates: Canonical retrieval pattern for recommendation/search systems
-- =====================================================
SELECT id,
name,
movie_embedding <=> omdb.get_embedding('May the force be with you') AS cosine_distance
FROM omdb.movies
WHERE movie_embedding IS NOT NULL
ORDER BY movie_embedding <=> omdb.get_embedding('May the force be with you')
LIMIT 5;
-- =====================================================
-- Semantic Search Using a CTE for Reusability
-- Demonstrates: Computing the query embedding once and reusing it
-- =====================================================
WITH query_embedding AS (SELECT omdb.get_embedding('A pirate captain who sails the seven seas in search of treasure') AS emb)
SELECT m.id,
m.name,
m.vote_average,
m.movie_embedding <=> q.emb AS cosine_distance
FROM omdb.movies m
CROSS JOIN query_embedding q
WHERE m.movie_embedding IS NOT NULL
ORDER BY m.movie_embedding <=> q.emb
LIMIT 10;
-- =====================================================
-- Full-Text Search Only
-- Demonstrates: Lexical keyword search without vectors
-- =====================================================
SELECT id,
name,
ts_rank(search_vector,
websearch_to_tsquery('english', '"A pirate captain who sails the seven seas in search of treasure"')) AS text_rank
FROM omdb.movies
WHERE search_vector @@ websearch_to_tsquery('english', '"A pirate captain who sails the seven seas in search of treasure"')
ORDER BY text_rank DESC
LIMIT 10;
-- =====================================================
-- Hybrid Search with Simple Combined Ranking
-- Demonstrates: Combining semantic similarity and full-text relevance
-- =====================================================
WITH query_data AS (SELECT omdb.get_embedding('May the force be with you') AS emb,
websearch_to_tsquery('english', '"May the force be with you"') AS tsq)
SELECT m.id,
m.name,
ROUND((1 - (m.movie_embedding <=> q.emb))::numeric, 4) AS semantic_score,
ROUND(ts_rank(m.search_vector, q.tsq)::numeric, 4) AS lexical_score,
ROUND(((1 - (m.movie_embedding <=> q.emb)) * 0.7 + ts_rank(m.search_vector, q.tsq) * 0.3)::numeric, 4) AS hybrid_score
FROM omdb.movies m
CROSS JOIN query_data q
WHERE m.movie_embedding IS NOT NULL
AND m.search_vector @@ q.tsq
ORDER BY hybrid_score DESC
LIMIT 10;
-- =====================================================
-- Hybrid Search with Broader Candidate Set
-- Demonstrates: Allowing either semantic closeness or text match before final ranking
-- =====================================================
WITH query_data AS (SELECT omdb.get_embedding('May the force be with you') AS emb,
websearch_to_tsquery('english', '"May the force be with you"') AS tsq)
SELECT m.id,
m.name,
ROUND((1 - (m.movie_embedding <=> q.emb))::numeric, 4) AS semantic_score,
ROUND(COALESCE(ts_rank(m.search_vector, q.tsq), 0)::numeric, 4) AS lexical_score,
ROUND(((1 - (m.movie_embedding <=> q.emb)) * 0.8 + COALESCE(ts_rank(m.search_vector, q.tsq), 0) * 0.2)::numeric, 4) AS hybrid_score
FROM omdb.movies m
CROSS JOIN query_data q
WHERE m.movie_embedding IS NOT NULL
AND (
m.search_vector @@ q.tsq
OR (m.movie_embedding <=> q.emb) < 0.45
)
ORDER BY hybrid_score DESC
LIMIT 15;
-- =====================================================
-- Hybrid Search with Business Ranking Signals
-- Demonstrates: Mixing AI relevance with rating and popularity signals
-- =====================================================
WITH query_data AS (SELECT omdb.get_embedding('May the force be with you') AS emb)
SELECT m.id,
m.name,
m.vote_average,
m.votes_count,
ROUND((1 - (m.movie_embedding <=> q.emb))::numeric, 4) AS semantic_score,
ROUND((
(1 - (m.movie_embedding <=> q.emb)) * 0.6 +
COALESCE(m.vote_average / 10.0, 0) * 0.25 +
LEAST(COALESCE(m.votes_count, 0), 100000)::numeric / 100000 * 0.15
)::numeric, 4) AS final_score
FROM omdb.movies m
CROSS JOIN query_data q
WHERE m.movie_embedding IS NOT NULL
ORDER BY final_score DESC
LIMIT 10;
-- =====================================================
-- Semantic Search with Aggregated Analytics
-- Demonstrates: Summarizing the top semantically similar result set
-- =====================================================
WITH top_matches AS (SELECT id,
name,
release_date,
runtime,
budget,
revenue,
vote_average,
votes_count,
movie_embedding <=> omdb.get_embedding('May the force be with you') AS cosine_distance
FROM omdb.movies
WHERE movie_embedding IS NOT NULL
ORDER BY movie_embedding <=> omdb.get_embedding('May the force be with you')
LIMIT 20)
SELECT COUNT(*) AS matched_movies,
AVG(runtime) AS avg_runtime,
AVG(vote_average) AS avg_vote_average,
AVG(budget) AS avg_budget,
AVG(revenue) AS avg_revenue,
MIN(release_date) AS oldest_release_date,
MAX(release_date) AS newest_release_date
FROM top_matches;
-- =====================================================
-- Group Semantic Matches by Release Decade
-- Demonstrates: Analytical reporting on retrieved semantic neighborhoods
-- =====================================================
WITH top_matches AS (SELECT id,
name,
release_date,
movie_embedding <=> omdb.get_embedding('May the force be with you') AS cosine_distance
FROM omdb.movies
WHERE movie_embedding IS NOT NULL
AND release_date IS NOT NULL
ORDER BY movie_embedding <=> omdb.get_embedding('May the force be with you')
LIMIT 50)
SELECT (EXTRACT(YEAR FROM release_date)::int / 10) * 10 AS release_decade,
COUNT(*) AS movies_in_decade
FROM top_matches
GROUP BY release_decade
ORDER BY release_decade;
-- =====================================================
-- Find Movies Similar to Another Movie
-- Demonstrates: Movie-to-movie recommendation using an existing movie embedding
-- =====================================================
SELECT m2.id,
m2.name,
m2.movie_embedding <=> m1.movie_embedding AS cosine_distance
FROM omdb.movies m1
JOIN omdb.movies m2
ON m1.id <> m2.id
WHERE m1.name = 'Inception'
AND m1.movie_embedding IS NOT NULL
AND m2.movie_embedding IS NOT NULL
ORDER BY m2.movie_embedding <=> m1.movie_embedding
LIMIT 10;
-- =====================================================
-- Find Movies Similar to Another Movie by ID
-- Demonstrates: More stable recommendation query using primary key lookup
-- =====================================================
SELECT m2.id,
m2.name,
ROUND((1 - (m2.movie_embedding <=> m1.movie_embedding))::numeric, 4) AS similarity_score
FROM omdb.movies m1
JOIN omdb.movies m2
ON m1.id <> m2.id
WHERE m1.id = 100
AND m1.movie_embedding IS NOT NULL
AND m2.movie_embedding IS NOT NULL
ORDER BY m2.movie_embedding <=> m1.movie_embedding
LIMIT 10;
-- =====================================================
-- Phrase-to-Phrase Similarity Search
-- Demonstrates: Finding semantically related user queries in the dictionary
-- =====================================================
SELECT d2.phrase,
d2.phrase_embedding <=> d1.phrase_embedding AS cosine_distance
FROM omdb.phrases_dictionary d1
JOIN omdb.phrases_dictionary d2
ON LOWER(d1.phrase) <> LOWER(d2.phrase)
WHERE LOWER(d1.phrase) = LOWER('May the force be with you')
AND d1.phrase_embedding IS NOT NULL
AND d2.phrase_embedding IS NOT NULL
ORDER BY d2.phrase_embedding <=> d1.phrase_embedding
LIMIT 10;
-- =====================================================
-- Use Dictionary Phrases to Retrieve Movies
-- Demonstrates: Running semantic search for multiple stored phrases
-- =====================================================
SELECT d.phrase,
m.id,
m.name,
m.movie_embedding <=> d.phrase_embedding AS cosine_distance
FROM omdb.phrases_dictionary d
CROSS JOIN LATERAL (
SELECT id,
name,
movie_embedding
FROM omdb.movies
WHERE movie_embedding IS NOT NULL
ORDER BY movie_embedding <=> d.phrase_embedding
LIMIT 3
) m
WHERE d.phrase_embedding IS NOT NULL
ORDER BY d.phrase, cosine_distance;
-- =====================================================
-- Semantic Search with Window Function Ranking
-- Demonstrates: Ranking neighbors and keeping ordering metadata
-- =====================================================
WITH query_embedding AS (SELECT omdb.get_embedding('May the force be with you') AS emb),
ranked_matches AS (SELECT m.id,
m.name,
m.release_date,
m.movie_embedding <=> q.emb AS cosine_distance,
ROW_NUMBER() OVER (ORDER BY m.movie_embedding <=> q.emb) AS semantic_rank
FROM omdb.movies m
CROSS JOIN query_embedding q
WHERE m.movie_embedding IS NOT NULL)
SELECT semantic_rank,
id,
name,
release_date,
cosine_distance
FROM ranked_matches
WHERE semantic_rank <= 10
ORDER BY semantic_rank;
-- =====================================================
-- Semantic Search with Threshold
-- Demonstrates: Returning only sufficiently similar items
-- =====================================================
SELECT id,
name,
movie_embedding <=> omdb.get_embedding('May the force be with you') AS cosine_distance
FROM omdb.movies
WHERE movie_embedding IS NOT NULL
AND (movie_embedding <=> omdb.get_embedding('May the force be with you')) < 0.35
ORDER BY movie_embedding <=> omdb.get_embedding('May the force be with you')
LIMIT 20;
-- =====================================================
-- Inspect Query Execution Plan for Semantic Search
-- Demonstrates: Verifying whether PostgreSQL uses the pgvector index
-- =====================================================
EXPLAIN ANALYZE
SELECT id,
name,
movie_embedding <=> omdb.get_embedding('May the force be with you') AS cosine_distance
FROM omdb.movies
WHERE movie_embedding IS NOT NULL
ORDER BY movie_embedding <=> omdb.get_embedding('May the force be with you')
LIMIT 10;
-- =====================================================
-- Inspect Query Execution Plan for Hybrid Search
-- Demonstrates: Understanding execution behavior for combined AI + text search
-- =====================================================
EXPLAIN ANALYZE
WITH query_data AS (SELECT omdb.get_embedding('May the force be with you') AS emb,
websearch_to_tsquery('english', '"May the force be with you"') AS tsq)
SELECT m.id,
m.name,
((1 - (m.movie_embedding <=> q.emb)) * 0.7 + ts_rank(m.search_vector, q.tsq) * 0.3) AS hybrid_score
FROM omdb.movies m
CROSS JOIN query_data q
WHERE m.movie_embedding IS NOT NULL
AND m.search_vector @@ q.tsq
ORDER BY hybrid_score DESC
LIMIT 10;