Commit

added tokenSplitRegex option

fergiemcdowall committed Apr 23, 2022
1 parent b6d85be commit 65ad3dc
Showing 11 changed files with 111 additions and 36 deletions.
4 changes: 2 additions & 2 deletions dist/search-index-3.0.3.js → dist/search-index-3.1.0.js
Large diffs are not rendered by default.

File renamed without changes.
Large diffs are not rendered by default.

File renamed without changes.

4 changes: 2 additions & 2 deletions dist/search-index.js
Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion docs/API.md

@@ -354,7 +354,7 @@
 If any document does not contain an `_id` field, then one will be
 generated and assigned

-`options` is an optional object that can contain the following values:
+`options` is an optional object that can contain the following values. These values can also be set when initialising the index rather than in every `PUT`:

 | Name | Type | Default | Description |
 |---|---|---|---|
@@ -366,6 +366,7 @@ generated and assigned
 |`storeRawDocs`|`boolean`|`true`|Whether to store the raw document or not. In many cases it may be desirable to store it externally, or to skip storing when indexing if it is going to be updated directly later on|
 |`storeVectors`|`boolean`|`false`|When `true`, documents will be deletable and overwritable, but will take up more space on disk|
 |`tokenizationPipeline`|`Array`|<pre lang="javascript">[<br>  SPLIT,<br>  SKIP,<br>  LOWCASE,<br>  REPLACE,<br>  NGRAMS,<br>  STOPWORDS,<br>  SCORE_TERM_FREQUENCY<br>]</pre>|Tokenisation pipeline. Stages can be added and reordered|
+|`tokenSplitRegex`|`RegExp`|`/[\p{L}\d]+/gu`|The regular expression that splits strings into tokens|
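
A minimal usage sketch for the new option (the index name and document below are illustrative; the `si`/`PUT`/`SEARCH` calls mirror the tests added in this commit):

```javascript
const si = require('search-index')

const main = async () => {
  const { PUT, SEARCH } = await si({
    name: 'my-index', // illustrative index name
    // widen the default /[\p{L}\d]+/gu so that a bare '*' survives tokenisation
    tokenSplitRegex: /[\p{L}\d*]+/gu
  })
  await PUT([{ _id: 1, bandName: '*' }])
  // '*' is now indexed as a token, so it is searchable
  console.log(await SEARCH(['*']))
}

main()
```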

### Tokenization pipeline when indexing

2 changes: 1 addition & 1 deletion package.json

@@ -1,6 +1,6 @@
 {
   "name": "search-index",
-  "version": "3.0.3",
+  "version": "3.1.0",
   "description": "A network resilient, persistent full-text search library for the browser and Node.js",
   "keywords": [
     "search",
7 changes: 3 additions & 4 deletions src/main.js

@@ -86,12 +86,11 @@ const initIndex = (ops = {}) =>
       ngrams: {},
       replace: {},
       storeRawDocs: true,
-      // TODO: processDocuments probably shouldn't be an option?
-
-      tokenizer: tp.tokenizer,
       stopwords: [],
       storeVectors: true, // TODO: make a test for this being false
-      tokenAppend: '#'
+      tokenAppend: '#',
+      tokenSplitRegex: /[\p{L}\d]+/gu,
+      tokenizer: tp.tokenizer
     },
     ops
   )
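
For orientation, a hypothetical reduction of the merge these defaults imply (assuming a plain `Object.assign`-style merge; the library's actual `initIndex` may differ in detail), showing that an option passed at init wins over its default:

```javascript
// Sketch of the defaults/ops merge suggested by the diff above
const defaults = {
  tokenAppend: '#',
  tokenSplitRegex: /[\p{L}\d]+/gu
}
const ops = { tokenSplitRegex: /[\p{L}\d*]+/gu } // user-supplied at init

const merged = Object.assign({}, defaults, ops)
console.log(merged.tokenSplitRegex) // /[\p{L}\d*]+/gu (user value wins)
console.log(merged.tokenAppend) // '#' (default retained)
```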
2 changes: 1 addition & 1 deletion src/tokenisationPipeline.js

@@ -2,7 +2,7 @@ const tv = require('term-vector')
 const ngraminator = require('ngraminator')

 const SPLIT = ([tokens, field, ops]) =>
-  Promise.resolve([tokens.match(/[\p{L}\d]+/gu) || [], field, ops])
+  Promise.resolve([tokens.match(ops.tokenSplitRegex) || [], field, ops])

 const SKIP = ([tokens, field, ops]) => [
   ops.skipFields.includes(field) ? [] : tokens,
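
The behavioural effect of this one-line change can be demonstrated standalone: `String.prototype.match` with the configured regex decides what counts as a token:

```javascript
// Default regex: only letters (any script) and digits form tokens
const defaultRegex = /[\p{L}\d]+/gu
console.log('The Rolling Stones'.match(defaultRegex) || [])
// [ 'The', 'Rolling', 'Stones' ]
console.log('*'.match(defaultRegex) || [])
// [] (a bare asterisk produces no tokens)

// Custom regex, as in the tests added below: '*' is now token material
const customRegex = /[\p{L}\d*]+/gu
console.log('*'.match(customRegex) || [])
// [ '*' ]
```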
23 changes: 0 additions & 23 deletions test/src/issue-571-test.js

@@ -56,27 +56,4 @@ test('set up as per issue #571', async function (t) {
       RESULT_LENGTH: 1
     }
   )
-
-  // t.ok(await FLUSH())
-
-  // t.deepEquals(await SEARCH(['q']), {
-  //   RESULT: [],
-  //   RESULT_LENGTH: 0
-  // })
-
-  // t.deepEquals(await PUT(docs), [
-  //   { _id: 'qwertyu', status: 'CREATED', operation: 'PUT' },
-  //   { _id: 'asdfgh', status: 'CREATED', operation: 'PUT' }
-  // ])
-
-  // t.deepEquals(await SEARCH(['q']), {
-  //   RESULT: [
-  //     {
-  //       _id: 'qwertyu',
-  //       _match: [{ FIELD: 'idx', VALUE: 'q', SCORE: '1.00' }],
-  //       _score: 1.1
-  //     }
-  //   ],
-  //   RESULT_LENGTH: 1
-  // })
 })
98 changes: 98 additions & 0 deletions test/src/issue-577-test.js

@@ -0,0 +1,98 @@
const si = require('../../')
const test = require('tape')

const sandbox = 'test/sandbox/'

const docs = [
  {
    _id: 1,
    bandName: '*',
    description: 'The nice boys of pop'
  },
  {
    _id: 'two',
    bandName: 'The Rolling Stones',
    description: 'The bad boys of rock'
  },
  {
    _id: 3,
    bandName: 'The Who',
    description: 'Nearly as good as Led Zeppelin'
  }
]

test('set up as per issue #577', async function (t) {
  t.plan(2)

  const { PUT, SEARCH } = await si({
    name: sandbox + '577',
    tokenSplitRegex: /[\p{L}\d*]+/gu
  })

  t.deepEquals(await PUT(docs), [
    { _id: 1, operation: 'PUT', status: 'CREATED' },
    { _id: 'two', operation: 'PUT', status: 'CREATED' },
    { _id: 3, operation: 'PUT', status: 'CREATED' }
  ])

  t.deepEquals(
    await SEARCH(['*'], {
      DOCUMENTS: true
    }),
    {
      RESULT: [
        {
          _id: 1,
          _match: [{ FIELD: 'bandname', VALUE: '*', SCORE: '1.00' }],
          _score: 1.39,
          _doc: {
            _id: 1,
            bandName: '*',
            description: 'The nice boys of pop'
          }
        }
      ],
      RESULT_LENGTH: 1
    }
  )
})

test('set up as per issue #577', async function (t) {
  t.plan(2)

  const { PUT, SEARCH } = await si({
    name: sandbox + '577-2'
  })

  t.deepEquals(
    await PUT(docs, {
      tokenSplitRegex: /[\p{L}\d*]+/gu
    }),
    [
      { _id: 1, operation: 'PUT', status: 'CREATED' },
      { _id: 'two', operation: 'PUT', status: 'CREATED' },
      { _id: 3, operation: 'PUT', status: 'CREATED' }
    ]
  )

  t.deepEquals(
    await SEARCH(['*'], {
      DOCUMENTS: true
    }),
    {
      RESULT: [
        {
          _id: 1,
          _match: [{ FIELD: 'bandname', VALUE: '*', SCORE: '1.00' }],
          _score: 1.39,
          _doc: {
            _id: 1,
            bandName: '*',
            description: 'The nice boys of pop'
          }
        }
      ],
      RESULT_LENGTH: 1
    }
  )
})
