The *pplx-embed-context-v1-4b* model is a text embedding model designed for large-scale semantic search. Unlike conventional embedding models, it incorporates the surrounding context of each document chunk, which makes it particularly effective in Retrieval-Augmented Generation (RAG) systems where contextual coherence is critical. It produces dense, unnormalized embeddings intended for comparison via cosine similarity, and it requires no instruction prefixes, which simplifies indexing pipelines and reduces the risk of inconsistent results. It is well suited to document search, similarity analysis, and knowledge-base enrichment, offering a robust, high-performance alternative to conventional embedding models.
# pplx-embed-v1: Diffusion-Pretrained Dense and Contextual Embeddings
pplx-embed-v1 and pplx-embed-context-v1 are state-of-the-art text embedding models optimized for real-world, web-scale retrieval tasks.
- **pplx-embed-v1** for independent text embedding (queries, documents, semantic search)
- **pplx-embed-context-v1** for document chunks in RAG systems where surrounding context matters

> [!IMPORTANT]
> `pplx-embed-v1` and `pplx-embed-context-v1` natively produce unnormalized int8-quantized embeddings. Ensure that you compare them via cosine similarity.

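Because the embeddings are not unit-normalized, a raw dot product is not a reliable similarity score. As a reference, a minimal cosine-similarity helper (assuming the embeddings are handled as NumPy arrays, as in the examples below) could look like this:

```python
import numpy as np

def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Cosine similarity between two unnormalized (possibly int8-quantized) embeddings."""
    a = a.astype(np.float32)
    b = b.astype(np.float32)
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
```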
| Model | Dimensions | Context | MRL | Quantization | Instruction | Pooling |
|---|---|---|---|---|---|---|
| pplx-embed-v1-0.6B | 1024 | 32K | Yes | INT8/BINARY | No | Mean |
| pplx-embed-v1-4B | 2560 | 32K | Yes | INT8/BINARY | No | Mean |
| pplx-embed-context-v1-0.6B | 1024 | 32K | Yes | INT8/BINARY | No | Mean |
| pplx-embed-context-v1-4B | 2560 | 32K | Yes | INT8/BINARY | No | Mean |
All models are built on Qwen3 with diffusion-based continued pre-training at Perplexity AI.
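Since the models are MRL-trained (Matryoshka Representation Learning), embeddings can typically be truncated to a leading prefix of dimensions to trade accuracy for storage. The officially supported prefix sizes are not listed here, so the target dimension below is purely illustrative; the usual pattern is a sketch like:

```python
import numpy as np

def truncate_mrl(embedding: np.ndarray, dim: int = 512) -> np.ndarray:
    """Keep only the leading `dim` dimensions (Matryoshka-style truncation).
    The value 512 is an illustrative assumption, not an officially documented size."""
    return embedding[..., :dim]
```

Truncated vectors should still be compared with cosine similarity, which implicitly renormalizes them.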
Many modern embedding models rely on instruction tuning, where users prepend an instruction string to the text being embedded. This can yield a 2%-3% lift on benchmarks, but it also introduces prompt-selection overhead and can make indexing pipelines brittle (small instruction changes can shift embedding space). We deliberately avoid this requirement: you can embed the text you want to index directly, without having to choose or maintain an instruction prefix.
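For example, the contextualized embeddings endpoint takes each document as a list of chunks, with no instruction prefix: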
```bash
curl -X POST https://api.perplexity.ai/v1/contextualizedembeddings \
  -H "Authorization: Bearer YOUR_API_KEY" \
  -H "Content-Type: application/json" \
  -d '{
    "input": [
      [
        "Curiosity begins in childhood with endless questions about the world.",
        "As we grow, curiosity drives us to explore new ideas and challenge assumptions.",
        "Scientific breakthroughs often start with a simple curious question."
      ],
      [
        "The curiosity rover explores Mars, searching for signs of ancient life.",
        "Each discovery on Mars sparks new questions about our place in the universe."
      ]
    ],
    "model": "pplx-embed-context-v1-4b"
  }'
```
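The same contextual embeddings can also be computed locally with `transformers`: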
```python
from transformers import AutoModel

model_ctx = AutoModel.from_pretrained(
    "perplexity-ai/pplx-embed-context-v1-4B",
    trust_remote_code=True
)

doc_chunks = [
    [
        "Curiosity begins in childhood with endless questions about the world.",
        "As we grow, curiosity drives us to explore new ideas.",
        "Scientific breakthroughs often start with a curious question."
    ],
    [
        "The curiosity rover explores Mars searching for ancient life.",
        "Each discovery on Mars sparks new questions about the universe."
    ]
]

# Returns a list of numpy arrays (one per document)
# embeddings[0].shape = (3, 2560), embeddings[1].shape = (2, 2560)
embeddings = model_ctx.encode(doc_chunks)
```
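As a quick sanity check (assuming `encode` returns one array of chunk embeddings per document, as noted in the comment above), individual chunks can be scored against each other with the `cosine_similarity` helper sketched earlier:

```python
# Hypothetical usage: compare the first chunk of each document.
# Thematically related chunks should score noticeably higher than unrelated ones.
score = cosine_similarity(embeddings[0][0], embeddings[1][0])
print(f"similarity: {score:.3f}")
```

The contextual model can also be run through an ONNX export with `onnxruntime` (the `onnx/model.onnx` path below assumes the export has been downloaded locally). The script joins each document's chunks with the tokenizer's SEP token, runs the encoder once per document, recovers per-chunk embeddings via late chunking, and finally applies int8 and binary quantization: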
```python
import onnxruntime as ort
from transformers import AutoTokenizer
import numpy as np
import torch


def quantize_int8_tanh(x):
    """Squash embeddings with tanh, then round into the int8 range [-128, 127]."""
    normalized = torch.tanh(x)
    rounded = torch.round(normalized * 127)
    clamped = torch.clamp(rounded, -128, 127)
    return clamped


def quantize_binary(x):
    """Binary quantization: map each dimension to +1/-1 by sign."""
    return torch.where(x >= 0, 1.0, -1.0)


def mean_pooling(
    token_embeddings: torch.Tensor, attention_mask: torch.Tensor
) -> torch.Tensor:
    """Apply mean pooling to token embeddings."""
    input_mask_expanded = (
        attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    )
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
        input_mask_expanded.sum(1), min=1e-9
    )


def extract_chunks_from_concatenated(
    input_ids: torch.Tensor,
    token_embeddings: torch.Tensor,
    attention_mask: torch.Tensor,
    tokenizer,
) -> list[list[torch.Tensor]]:
    """
    Extract individual chunk embeddings from a concatenated sequence using late chunking.

    This method splits concatenated sequences like "[chunk1][SEP][chunk2][SEP]..."
    back into individual chunk embeddings by finding SEP token positions.

    Args:
        input_ids: Token IDs (batch_size, seq_len)
        token_embeddings: Token embeddings (batch_size, seq_len, hidden_dim)
        attention_mask: Attention mask (batch_size, seq_len)
        tokenizer: Tokenizer providing the SEP token ID

    Returns:
        list[list[torch.Tensor]]: List of documents, each containing a list of chunk embeddings

    Note:
        The sep_token_id is retrieved from tokenizer.sep_token_id.
        Common values: pplx-embed-v1=151643, BERT=102; it varies by tokenizer.
    """
    sep_token_id = tokenizer.sep_token_id
    batch_size = input_ids.shape[0]
    all_doc_chunks = []
    for batch_idx in range(batch_size):
        # Positions of non-padding SEP tokens
        valid_positions = attention_mask[batch_idx].bool()
        sep_positions = (
            (input_ids[batch_idx] == sep_token_id) & valid_positions
        ).nonzero(as_tuple=True)[0]
        chunk_embeddings = []
        start_pos = 0
        for sep_pos in sep_positions:
            chunk_tokens = token_embeddings[batch_idx, start_pos:sep_pos]
            chunk_mask = attention_mask[batch_idx, start_pos:sep_pos]
            chunk_emb = mean_pooling(
                chunk_tokens.unsqueeze(0), chunk_mask.unsqueeze(0)
            ).squeeze(0)
            chunk_embeddings.append(chunk_emb)
            start_pos = sep_pos + 1
        # Handle the last chunk (after the last SEP token)
        last_valid_pos = attention_mask[batch_idx].sum().item()
        chunk_tokens = token_embeddings[batch_idx, start_pos:last_valid_pos]
        chunk_mask = attention_mask[batch_idx, start_pos:last_valid_pos]
        if chunk_mask.sum() > 0:
            chunk_emb = mean_pooling(
                chunk_tokens.unsqueeze(0), chunk_mask.unsqueeze(0)
            ).squeeze(0)
        else:
            # Empty chunk - create a zero embedding
            chunk_emb = torch.zeros(
                token_embeddings.shape[-1],
                device=token_embeddings.device,
                dtype=token_embeddings.dtype,
            )
        chunk_embeddings.append(chunk_emb)
        all_doc_chunks.append(chunk_embeddings)
    return all_doc_chunks


hf_path = "perplexity-ai/pplx-embed-context-v1-4b"
onnx_path = "onnx/model.onnx"

tokenizer = AutoTokenizer.from_pretrained(hf_path, trust_remote_code=True)
session = ort.InferenceSession(onnx_path)

texts = [
    [
        "Curiosity begins in childhood with endless questions about the world.",
        "As we grow, curiosity drives us to explore new ideas.",
        "Scientific breakthroughs often start with a curious question."
    ],
    [
        "The curiosity rover explores Mars searching for ancient life.",
        "Each discovery on Mars sparks new questions about the universe."
    ]
]

# Join each document's chunks with the SEP token so they are encoded in one pass
doc_strings = [
    tokenizer.sep_token.join(chunks) for chunks in texts
]
tokenized = tokenizer(
    doc_strings,
    padding=True,
    truncation=True,
    return_tensors="np",
)
onnx_inputs = {
    "input_ids": tokenized["input_ids"].astype(np.int64),
    "attention_mask": tokenized["attention_mask"].astype(np.int64),
}

# Run inference
onnx_outputs = session.run([out.name for out in session.get_outputs()], onnx_inputs)
# onnx_outputs is a list with one element: [last_hidden_state]
last_hidden_state = onnx_outputs[0]

# Recover one embedding per chunk via late chunking
batch_chunk_embeddings = extract_chunks_from_concatenated(
    input_ids=torch.tensor(onnx_inputs["input_ids"]),
    token_embeddings=torch.tensor(last_hidden_state),
    attention_mask=torch.tensor(onnx_inputs["attention_mask"]),
    tokenizer=tokenizer,
)
# Stack chunk embeddings into one (num_chunks, hidden_dim) tensor per document
batch_chunk_embeddings = [
    torch.stack([chunk for chunk in doc_chunks], dim=0)
    for doc_chunks in batch_chunk_embeddings
]

# Quantize: int8 via tanh scaling, and binary (+1/-1) by sign
int8_embeddings = [quantize_int8_tanh(x) for x in batch_chunk_embeddings]
binary_embeddings = [quantize_binary(x) for x in batch_chunk_embeddings]

# Pack binary embeddings into bits (8 dimensions per byte) for compact storage
bits = [np.where(doc.numpy() >= 0, True, False) for doc in binary_embeddings]
packed_embeddings = [np.packbits(b, axis=-1) for b in bits]
```
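Packed binary embeddings can be scored with Hamming distance: for sign-quantized vectors of equal length, Hamming distance is a monotone transform of cosine similarity, so it yields the same ranking. A minimal NumPy sketch (the helper name is illustrative):

```python
import numpy as np

# Precomputed number of set bits for each byte value (0-255)
_POPCOUNT_TABLE = np.array([bin(i).count("1") for i in range(256)], dtype=np.uint16)

def hamming_distance(packed_a: np.ndarray, packed_b: np.ndarray) -> int:
    """Number of differing bits between two bit-packed binary embeddings."""
    return int(_POPCOUNT_TABLE[np.bitwise_xor(packed_a, packed_b)].sum())

# Hypothetical usage with packed_embeddings from the script above:
# distance between the first chunk of document 0 and the first chunk of document 1
d = hamming_distance(packed_embeddings[0][0], packed_embeddings[1][0])
print(d)
```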
For comprehensive technical details and evaluation results, see our paper on arXiv: https://arxiv.org/abs/2602.11151.