Build a product inventory with Trieve
In this guide we are going to use the Amazon Berkeley Objects dataset (https://amazon-berkeley-objects.s3.amazonaws.com/index.html) to build a product inventory that can be searched with Trieve.
Cloning the example repository
git clone https://github.com/devflowinc/examples
cd examples/amazon-berkeley-dataset
Getting the data
wget https://amazon-berkeley-objects.s3.amazonaws.com/archives/abo-listings.tar
tar xvf abo-listings.tar
gzip -d listings/metadata/listings_*.json.gz
This should extract several listings files under listings/metadata/, each of which contains one JSON listing per line.
Image CSV Data
wget https://amazon-berkeley-objects.s3.amazonaws.com/images/metadata/images.csv.gz
gzip -d images.csv.gz
The images.csv file is formatted as follows:
image_id,height,width,path
010-mllS7JL,106,106,14/14fe8812.jpg
01dkn0Gyx0L,122,122,da/daab0cad.jpg
01sUPg0387L,111,111,d2/d2daaae9.jpg
Running the ingest script
To run the ingest script, you first need to create or edit a .env file containing:
TRIEVE_API_KEY="tr-********************************"
TRIEVE_DATASET_ID="********-****-****-****-************"
Then run the ingest script.
A breakdown of the script is below.
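Not shown in the excerpts below is the very top of the script, which pulls in the Node APIs used later and reads the two values from the .env file (Bun loads .env automatically). It looks roughly like this:
import { readdir } from "node:fs/promises";
import { createReadStream } from "node:fs";
import { resolve } from "node:path";
import readline from "node:readline";

// Populated from the .env file created above.
const trieveApiKey = process.env.TRIEVE_API_KEY!;
const trieveDatasetId = process.env.TRIEVE_DATASET_ID!;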
Types
The first step is to define all of the types the script uses.
// Shape of the payload we send to Trieve's chunk creation route.
interface Chunk {
  chunk_html: string;
  link: string;
  tracking_id: string;
  tag_set: string[];
  metadata: Record<string, unknown>;
  image_urls?: string[];
  time_stamp?: string;
  upsert_by_tracking_id: boolean;
}
interface LanguageTaggedValue {
language_tag: string;
value: string;
}
interface ItemDimension {
unit: string;
value: number;
}
interface ItemDimensions {
height: ItemDimension;
length: ItemDimension;
width: ItemDimension;
}
interface Node {
node_id: number;
node_name: string;
}
interface Item {
brand: LanguageTaggedValue[];
bullet_point: LanguageTaggedValue[];
color: LanguageTaggedValue[];
item_id: string;
price?: number;
image_url?: string;
item_name: LanguageTaggedValue[];
model_name: LanguageTaggedValue[];
model_number: { value: string }[];
model_year: { value: number }[];
product_type: { value: string }[];
style: LanguageTaggedValue[];
main_image_id: string;
other_image_id: string[];
item_keywords: LanguageTaggedValue[];
country: string;
marketplace: string;
domain_name: string;
node: Node[];
item_dimensions?: ItemDimensions;
}
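For orientation, here is what a populated Chunk might look like once a listing has been processed; the values below are purely illustrative and not taken from the dataset.
const exampleChunk: Chunk = {
  chunk_html: "Price: $50\nBrand: ExampleBrand\nProduct Name: Example Desk Lamp",
  link: "https://amazon.com/dp/B000EXAMPLE",
  tracking_id: "B000EXAMPLE",
  tag_set: ["desk lamp", "lighting"],
  metadata: { item_id: "B000EXAMPLE", price: 50 },
  image_urls: [
    "https://amazon-berkeley-objects.s3.amazonaws.com/images/small/14/14fe8812.jpg",
  ],
  upsert_by_tracking_id: true,
};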
Image Hashmap
Since the image metadata and the JSON listings live in separate files, we first parse images.csv into a hashmap that maps each image_id to its path.
// Parse images.csv into a Map of image_id -> relative image path.
function parseCSV(csvData: string): Map<string, string> {
  const lines = csvData.split("\n");
  const hashmap = new Map<string, string>();
  // Start at 1 to skip the header row.
  for (let i = 1; i < lines.length; i++) {
    const values = lines[i].trim().split(",");
    let imageId: string | null = null;
    let path: string | null = null;
    if (values[0]) {
      imageId = values[0].trim();
    }
    if (values[3]) {
      path = values[3].trim();
    }
    if (imageId != null && path != null) {
      hashmap.set(imageId, path);
    }
  }
  return hashmap;
}
const imageFilePath = "./images.csv";
const csvImageData = await Bun.file(imageFilePath).text();
const imageHashMap = parseCSV(csvImageData);
if (imageHashMap.size === 0) {
  console.log("Failed to load image csv data");
}
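As a quick check, looking up the first image_id from the images.csv sample above should return its relative path:
// "010-mllS7JL" is the first image_id in the images.csv sample shown earlier.
console.log(imageHashMap.get("010-mllS7JL")); // expected output: 14/14fe8812.jpg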
Processing a Single Chunk
For each line of JSON we construct a Chunk object; chunks can be pushed into Trieve individually or in bulk. We use the imageHashMap built above to look up the image_url for each item.
function processLine(line: string): Chunk {
  const item: Item = JSON.parse(line);
  let image_url: string | undefined;
  const imageId = item.main_image_id == "" ? null : item.main_image_id;
  const imagePath = imageId ? imageHashMap.get(imageId) : undefined;
  // Listings do not ship with a price, so assign a random placeholder from a fixed set.
  const price = [10, 25, 50, 100, 500, 1000][Math.floor(Math.random() * 6)];
  if (imagePath != null) {
    image_url = `https://amazon-berkeley-objects.s3.amazonaws.com/images/small/${imagePath}`;
  }
let searchableString = "";
const addField = (
field: string | undefined,
prefix: string = "",
postfix: string = "\n"
) => {
if (field) {
searchableString += `${prefix}${field}${postfix}`;
}
};
addField(price?.toString(), "Price: $");
addField(item.brand?.[0]?.value, "Brand: ");
addField(item.item_name?.[0]?.value, "Product Name: ");
item.bullet_point?.forEach((bp) => addField(bp.value, "", ";"));
addField(item.color?.[0]?.value, "Color: ");
addField(item.model_name?.[0]?.value, "Model Name: ");
addField(item.model_number?.[0]?.value, "Model Number: ");
if (item.model_year?.[0]?.value !== undefined) {
addField(item.model_year[0].value.toString(), "Model Year: ");
}
addField(item.product_type?.[0]?.value, "Product Type: ");
addField(item.style?.[0]?.value, "Style: ");
item.item_keywords?.forEach((kw) => addField(kw.value, "", ";"));
addField(item.country, "Country: ");
addField(item.marketplace, "Marketplace: ");
addField(item.domain_name, "Domain: ");
  // Keep the full item, plus the derived image_url and price, as metadata.
  const metadata: Record<string, unknown> = { ...item };
  metadata.image_url = image_url;
  metadata.price = price;
  const chunkData: Chunk = {
    chunk_html: searchableString.trim(),
    link: `https://${item.domain_name}/dp/${item.item_id}`,
    tracking_id: item.item_id,
    tag_set: item.item_keywords?.map((kw) => kw.value) ?? [],
    metadata,
    image_urls: image_url ? [image_url] : [],
    upsert_by_tracking_id: true,
  };
return chunkData;
}
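To sanity-check processLine before running a full ingest, you can feed it a single hand-written listing. The object below is a minimal, hypothetical item rather than a real row from the dataset; fields it omits are simply skipped by the optional chaining above.
// A hypothetical listing, just to exercise processLine.
const sampleLine = JSON.stringify({
  item_id: "B000EXAMPLE",
  item_name: [{ language_tag: "en_US", value: "Example Desk Lamp" }],
  brand: [{ language_tag: "en_US", value: "ExampleBrand" }],
  main_image_id: "",
  domain_name: "amazon.com",
  marketplace: "Amazon",
  country: "US",
});

// Prints the Chunk that would be sent to Trieve for this listing.
console.log(processLine(sampleLine));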
Iterating and sending chunks to Trieve
The last step is to iterate through all of the JSON files and process each line into a Chunk object. We then use the /chunk endpoint to create the chunks, sending them in batches of 120, which is the maximum batch size for this route.
const directoryPath = "./listings/metadata/";
const files = await readdir(directoryPath);
for (const file of files) {
  const fullPath = resolve(directoryPath + file);
  const fileStream = createReadStream(fullPath);
  const rl = readline.createInterface({
    input: fileStream,
    crlfDelay: Infinity,
  });
  // Parse every line of the file into a Chunk before sending anything.
  const items: Chunk[] = [];
  for await (const line of rl) {
    try {
      items.push(processLine(line));
    } catch (error) {
      console.error("Error parsing JSON from line:", error);
    }
  }
  // The /chunk route accepts at most 120 chunks per request, so split into batches.
  const batchSize = 120;
  const chunkedItems: Chunk[][] = [];
  for (let i = 0; i < items.length; i += batchSize) {
    chunkedItems.push(items.slice(i, i + batchSize));
  }
  for (const batch of chunkedItems) {
    try {
      console.log(`Creating ${batch.length} chunks`);
      await fetch("https://api.trieve.ai/api/chunk", {
        method: "POST",
        headers: {
          "TR-Dataset": trieveDatasetId,
          Authorization: trieveApiKey,
          "Content-Type": "application/json",
        },
        body: JSON.stringify(batch),
      });
    } catch (error) {
      console.error("Failed to create chunks");
      console.error(error);
    }
  }
}
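Once the script has finished, a quick way to confirm that chunks landed in the dataset is to run a search against it. The request below is a sketch that uses Trieve's chunk search route with just a query and a search_type; check the API reference for the full set of search parameters.
const searchResponse = await fetch("https://api.trieve.ai/api/chunk/search", {
  method: "POST",
  headers: {
    "TR-Dataset": trieveDatasetId,
    Authorization: trieveApiKey,
    "Content-Type": "application/json",
  },
  body: JSON.stringify({
    query: "desk lamp",
    search_type: "hybrid",
  }),
});
console.log(await searchResponse.json());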
Check out the API reference for the chunk upload route to see all of the parameters that can be passed in with each chunk.