diff --git a/README.md b/README.md index 25722c9..4a79ebd 100644 --- a/README.md +++ b/README.md @@ -1,70 +1,72 @@ -# PHPVector adapter for Neuron AI framework +# neuron-core/php-vector -This is the [PHPVector](https://github.com/ezimuel/PHPVector) adapter for the [Neuron AI framework](https://neuron-ai.dev/). +PHPVector adapter for the [Neuron AI](https://neuron-ai.dev) framework. Implements +`NeuronAI\RAG\VectorStore\VectorStoreInterface` on top of `ezimuel/phpvector`. -## Install +## Installation -``` +```bash composer require neuron-core/php-vector ``` -## Use in RAG +## Usage ```php use NeuronAI\PHPVector\PHPVector; +use PHPVector\VectorDatabase; + +// Persistent database: pass a path to enable on-disk storage. +$store = new PHPVector( + database: new VectorDatabase(path: '/var/data/mydb'), + topK: 5, +); +``` + +Inside a Neuron RAG class: -class MyRAG extends RAG +```php +protected function vectorStore(): VectorStoreInterface { - ... - - protected function vectorStore(): VectorStoreInterface - { - return new PHPVector( - database: new VectorDatabase(path: '/var/data/mydb'), - topK: 5 - ); - } + return new PHPVector( + database: new VectorDatabase(path: '/var/data/mydb'), + topK: 5, + ); } ``` -## Use in Retrieval components +## Persistence + +PHPVector separates document storage from index storage: + +- `new VectorDatabase(path: '...')` creates (or targets) a database directory. +- `VectorDatabase::open('...')` loads an existing database from disk. +- `addDocument()` writes the document file to disk on each call (asynchronously via `pcntl_fork` when available, otherwise synchronously). +- `save()` persists the HNSW + BM25 index and finalizes deletions. + +By default this adapter auto-saves after every mutation (`addDocument`, `addDocuments`, +`deleteBy`), batched to a single `save()` per call, so persistence "just works". Disable it +to manage `save()` yourself: ```php -use NeuronAI\PHPVector\PHPVector; +$store = new PHPVector(database: $db, autoSave: false); +// ... many addDocument() calls ... +$db->save(); +``` -class MyAgent extends Agent -{ - ... - - protected function tools(): array - { - return [ - RetrievalTool::make( - new SimilarityRetrieval( - $this->vectorStore(), - $this->embeddings() - ) - ), - ]; - } - - protected function vectorStore(): VectorStoreInterface - { - return new PHPVector( - database: new VectorDatabase(path: '/var/data/mydb'), - topK: 5 - ); - } - - protected function embeddings(): EmbeddingsProviderInterface - { - return new OllamaEmbeddingsProvider( - model: 'OLLAMA_EMBEDDINGS_MODEL' - ); - } -} +Auto-save is skipped for in-memory databases (no path), so it never throws. + +## Deletion + +`deleteBy()` removes documents by Neuron's `sourceType` / `sourceName`, which this adapter +stores as PHPVector metadata: + +```php +$store->deleteBy('pdf'); // all documents from sourceType "pdf" +$store->deleteBy('pdf', 'manual.pdf'); // only that exact source ``` -## Official documentation +## Requirements -**[Go to the official documentation](https://neuron.inspector.dev/)** +- PHP 8.1+ +- `ezimuel/phpvector` ^0.3.0 +- `neuron-core/neuron-ai` ^3.0 diff --git a/composer.json b/composer.json index f204d13..af04d0a 100644 --- a/composer.json +++ b/composer.json @@ -1,6 +1,6 @@ { "name": "neuron-core/php-vector", - "description": "Conversational Data Collection.", + "description": "PHPVector adapter for the Neuron AI framework.", "minimum-stability": "stable", "authors": [ { @@ -11,7 +11,7 @@ "license": "MIT", "require": { "php": "^8.1", - "ezimuel/phpvector": "^0.2.0", + "ezimuel/phpvector": "^0.3.0", "neuron-core/neuron-ai": "^3.0" }, "require-dev": { diff --git a/src/PHPVector.php b/src/PHPVector.php index f75132b..87991ff 100644 --- a/src/PHPVector.php +++ b/src/PHPVector.php @@ -4,11 +4,11 @@ namespace NeuronAI\PHPVector; -use NeuronAI\Exceptions\VectorStoreException; use NeuronAI\RAG\Document as NeuronDocument; use NeuronAI\RAG\VectorStore\VectorStoreInterface; use NeuronAI\StaticConstructor; use PHPVector\Document; +use PHPVector\Metadata\MetadataFilter; use PHPVector\SearchResult; use PHPVector\VectorDatabase; @@ -18,22 +18,20 @@ class PHPVector implements VectorStoreInterface { use StaticConstructor; + private const SOURCE_TYPE_KEY = 'sourceType'; + private const SOURCE_NAME_KEY = 'sourceName'; + public function __construct( protected VectorDatabase $database, protected int $topK = 5, + protected bool $autoSave = true, ) { } public function addDocument(NeuronDocument $document): VectorStoreInterface { - $this->database->addDocument( - new Document( - id: $document->id, - vector: $document->embedding, - text: $document->content, - metadata: $document->metadata, - ) - ); + $this->write($document); + $this->persist(); return $this; } @@ -44,27 +42,67 @@ public function addDocument(NeuronDocument $document): VectorStoreInterface public function addDocuments(array $documents): VectorStoreInterface { foreach ($documents as $document) { - $this->addDocument($document); + $this->write($document); } + $this->persist(); return $this; } /** - * @throws VectorStoreException + * Persist a Neuron document into PHPVector. + * + * Neuron's `sourceType`/`sourceName` are top-level Document properties, but + * PHPVector only stores `metadata`. They are folded into metadata under the + * reserved keys so `deleteBy()` can filter on them; `similaritySearch()` + * restores them and strips the reserved keys back out. */ + private function write(NeuronDocument $document): void + { + $this->database->addDocument( + new Document( + id: $document->id, + vector: $document->embedding, + text: $document->content, + metadata: [ + ...$document->metadata, + self::SOURCE_TYPE_KEY => $document->sourceType, + self::SOURCE_NAME_KEY => $document->sourceName, + ], + ) + ); + } + + private function persist(): void + { + if ($this->autoSave && $this->database->isPersistent()) { + $this->database->save(); + } + } + public function deleteBy(string $sourceType, ?string $sourceName = null): VectorStoreInterface { - throw new VectorStoreException('Deletion not supported.'); + $filters = [MetadataFilter::eq(self::SOURCE_TYPE_KEY, $sourceType)]; + + if ($sourceName !== null) { + $filters[] = MetadataFilter::eq(self::SOURCE_NAME_KEY, $sourceName); + } + + foreach ($this->database->metadataSearch(filters: $filters) as $result) { + $this->database->deleteDocument($result->document->id); + } + + $this->persist(); + + return $this; } /** - * @throws VectorStoreException + * @deprecated Use deleteBy() instead. */ public function deleteBySource(string $sourceType, string $sourceName): VectorStoreInterface { - $this->deleteBy($sourceType, $sourceName); - return $this; + return $this->deleteBy($sourceType, $sourceName); } /** @@ -79,11 +117,21 @@ public function similaritySearch(array $embedding): iterable ); return array_map(function (SearchResult $result): NeuronDocument { - $document = new NeuronDocument($result->document->text); - $document->id = $result->document->id; - $document->embedding = $result->document->vector; - $document->metadata = $result->document->metadata; + $phpDoc = $result->document; + + $metadata = $phpDoc->metadata; + $sourceType = $metadata[self::SOURCE_TYPE_KEY] ?? 'manual'; + $sourceName = $metadata[self::SOURCE_NAME_KEY] ?? 'manual'; + unset($metadata[self::SOURCE_TYPE_KEY], $metadata[self::SOURCE_NAME_KEY]); + + $document = new NeuronDocument($phpDoc->text); + $document->id = $phpDoc->id; + $document->embedding = $phpDoc->vector; + $document->sourceType = $sourceType; + $document->sourceName = $sourceName; + $document->metadata = $metadata; $document->score = $result->score; + return $document; }, $results); } diff --git a/tests/PHPVectorTest.php b/tests/PHPVectorTest.php index 2ebf73f..234c367 100644 --- a/tests/PHPVectorTest.php +++ b/tests/PHPVectorTest.php @@ -9,6 +9,19 @@ use PHPVector\VectorDatabase; use PHPUnit\Framework\TestCase; +use function array_diff; +use function array_fill; +use function is_array; +use function is_dir; +use function iterator_to_array; +use function mt_getrandmax; +use function mt_rand; +use function rmdir; +use function scandir; +use function sys_get_temp_dir; +use function uniqid; +use function unlink; + class PHPVectorTest extends TestCase { private string $tempDir; @@ -85,7 +98,7 @@ public function testPersistDocumentsAcrossInstances(): void { // Create and persist documents with first instance $database = new VectorDatabase(path: $this->tempDir); - $adapter = new PHPVector($database); + $adapter = new PHPVector($database, autoSave: false); $documents = [ $this->createDocumentWithEmbedding('Persisted document 1'), @@ -139,7 +152,6 @@ public function testSimilaritySearchReturnsResults(): void $results = $adapter->similaritySearch($queryEmbedding); $this->assertNotEmpty($results); - $this->assertIsIterable($results); $resultsArray = is_array($results) ? $results : iterator_to_array($results); $this->assertCount(3, $resultsArray); @@ -238,10 +250,152 @@ public function testAddDocumentsReturnsAdapterInstance(): void $this->assertSame($adapter, $result); } + public function testSourceTypeAndNameRoundTripWithoutLeakingIntoMetadata(): void + { + $database = new VectorDatabase(); + $adapter = new PHPVector($database); + + $document = new NeuronDocument('Round trip content'); + $document->id = 'rt1'; + $document->embedding = $this->createTestEmbedding(); + $document->sourceType = 'pdf'; + $document->sourceName = 'manual.pdf'; + $document->metadata = ['author' => 'jane', 'pages' => 12, 'published' => true]; + + $adapter->addDocument($document); + + $results = $adapter->similaritySearch($document->embedding); + $resultsArray = is_array($results) ? $results : iterator_to_array($results); + $first = $resultsArray[0]; + + self::assertSame('pdf', $first->sourceType); + self::assertSame('manual.pdf', $first->sourceName); + self::assertSame(['author' => 'jane', 'pages' => 12, 'published' => true], $first->metadata); + } + + public function testMutationsPersistWhenAutoSaveEnabled(): void + { + $database = new VectorDatabase(path: $this->tempDir); + $adapter = new PHPVector($database); + + $adapter->addDocuments([ + $this->createDocumentWithEmbedding('Auto 1'), + $this->createDocumentWithEmbedding('Auto 2'), + ]); + + // No explicit save(): auto-save should have persisted the index. + $reopened = VectorDatabase::open($this->tempDir); + self::assertSame(2, $reopened->count()); + } + + public function testAutoSaveDisabledDoesNotPersistUntilManualSave(): void + { + $database = new VectorDatabase(path: $this->tempDir); + $adapter = new PHPVector($database, autoSave: false); + + $adapter->addDocuments([ + $this->createDocumentWithEmbedding('Manual 1'), + $this->createDocumentWithEmbedding('Manual 2'), + ]); + + // Index not yet persisted: meta.json must not exist on disk. + self::assertFileDoesNotExist($this->tempDir . '/meta.json'); + + $database->save(); + $afterSave = VectorDatabase::open($this->tempDir); + self::assertSame(2, $afterSave->count()); + } + + public function testDeleteByRemovesMatchingSourceType(): void + { + $database = new VectorDatabase(); + $adapter = new PHPVector($database); + + $adapter->addDocuments([ + $this->makeSourcedDocument('a', 'pdf', 'one.pdf'), + $this->makeSourcedDocument('b', 'pdf', 'two.pdf'), + $this->makeSourcedDocument('c', 'web', 'site'), + ]); + self::assertSame(3, $database->count()); + + $adapter->deleteBy('pdf'); + + self::assertSame(1, $database->count()); + } + + public function testDeleteByRemovesOnlyExactTypeAndName(): void + { + $database = new VectorDatabase(); + $adapter = new PHPVector($database); + + $adapter->addDocuments([ + $this->makeSourcedDocument('a', 'pdf', 'one.pdf'), + $this->makeSourcedDocument('b', 'pdf', 'two.pdf'), + ]); + + $adapter->deleteBy('pdf', 'one.pdf'); + + self::assertSame(1, $database->count()); + } + + public function testDeleteByWithNoMatchIsNoop(): void + { + $database = new VectorDatabase(); + $adapter = new PHPVector($database); + + $adapter->addDocument($this->makeSourcedDocument('a', 'pdf', 'one.pdf')); + + $result = $adapter->deleteBy('missing'); + + self::assertSame(1, $database->count()); + self::assertSame($adapter, $result); + } + + public function testDeleteBySourceDelegatesToDeleteBy(): void + { + $database = new VectorDatabase(); + $adapter = new PHPVector($database); + + $adapter->addDocuments([ + $this->makeSourcedDocument('a', 'pdf', 'one.pdf'), + $this->makeSourcedDocument('b', 'web', 'site'), + ]); + + $adapter->deleteBySource('pdf', 'one.pdf'); + + self::assertSame(1, $database->count()); + } + + public function testDeleteByPersistsWhenAutoSaveEnabled(): void + { + $database = new VectorDatabase(path: $this->tempDir); + $adapter = new PHPVector($database); + + $adapter->addDocuments([ + $this->makeSourcedDocument('a', 'pdf', 'one.pdf'), + $this->makeSourcedDocument('b', 'web', 'site'), + ]); + + $adapter->deleteBy('pdf'); + + $reopened = VectorDatabase::open($this->tempDir); + self::assertSame(1, $reopened->count()); + } + private function createDocumentWithEmbedding(string $content): NeuronDocument { $document = new NeuronDocument($content); $document->embedding = $this->createTestEmbedding(); return $document; } + + private function makeSourcedDocument(string $id, string $sourceType, string $sourceName): NeuronDocument + { + $document = new NeuronDocument('content ' . $id); + $document->id = $id; + $document->embedding = $this->createTestEmbedding(); + $document->sourceType = $sourceType; + $document->sourceName = $sourceName; + return $document; + } }