bigrams shmigrams

Former-commit-id: 7947042da92a0c541faa499020d0ff72fd054bf2
This commit is contained in:
Ben
2022-04-25 01:21:15 +01:00
parent 4ce954cd4d
commit 9ff654d973
5 changed files with 76 additions and 79 deletions

View File

@@ -20,14 +20,10 @@ automatically every request
| POST | /api/bulk/brick | array | no | POST due to bulk nature |
| GET | /api/set/:id | | no | |
| GET | /api/cdn/:id | | no | |
| GET | /api/basket/price/ | | no | |
| PUT | /api/auth/login/ | | yes | |
| POST | /api/auth/signup/ | | yes | |
| GET | /api/auth/orders/ | | yes | |
| GET | /api/auth/basket/ | | yes | |
| PUT | /api/auth/basket/:id | quantity | yes | |
| POST | /api/auth/basket/:id | | yes | manipulate basket content |
| DEL | /api/auth/basket/:id | quantity | yes | if no id, delete whole |
| DEL | /api/auth/basket/ | | yes | if no id, delete whole |
Query endpoints do not return the full data on a brick/set, they return
a subset for product listing pages
@@ -52,78 +48,6 @@ brick: brick to search for (absolute type, fuzzy string)
set: set to search for (absolute, fuzzy string)
### /api/special/
GET /api/special/
Response Object
```json
{
"data": {
"title": "Special 1",
"end": "2020-01-31T00:00:00.000Z",
}
}
```
### /api/type/:id
GET /api/type/:id
Response Object
```json
{
"data": {
"type": "brick", // or set
}
}
```
### /api/search/
GET /api/search?params
### /api/bricks/
GET
Response Object
```json
{
}
```
### /api/sets/
### /api/brick/:id/
### /api/set/:id/
### /api/cdn/:id/
### /api/auth/login/
### /api/auth/signup/
Request Body
```json
{
}
```
Response Object
```json
{
}
```
### /api/auth/orders/
### /api/auth/basket/
## Response Structure
```js

24
docs/QUERY.md Normal file
View File

@@ -0,0 +1,24 @@
# Query
In order to quickly and accurately query my database for hand-written
searches, I need a way to perform a "fuzzy" search on the data at
hand.
I had previously created a simple algorithm to search a database with
a basic posix regex filter and then rank them on the distance to the
input query, which was ordered based on Levenshtein Distance.
Keeping to the theme of querying a large, static data set for
multiple known attributes (name, tags, id), I decided to stick with
the distance function for result relevancy but lean into automatic
spell correction with an n-gram model, proposing to the controller
"the most probable few alternate queries" that the user might have
meant, which can then be queried in the database to go through the
same relevancy distance sorting.
## What I didn't have time for
My end goal was to create a gradient boosted decision tree which would
correct spelling based on a trained data set of word frequency, n-gram
modeling, distance between words and also a static dictionary. However
this proved out of scope.

View File

@@ -1,2 +0,0 @@

View File

@@ -0,0 +1,43 @@
const axios = require("axios");
let StaticDictionary = [];
/**
 * Download the static dictionary used for spell correction.
 *
 * The corncob corpus is served as one newline-separated plain-text word
 * list, so the raw response body is split into an array of words here —
 * `StaticDictionary` is declared as an array, and assigning the raw
 * string (as before) would have left it holding a single giant string.
 *
 * @returns {Promise<void>} resolves once StaticDictionary is populated.
 * @throws network errors propagate to the caller (no retry/fallback).
 */
async function Init() {
  const response = await axios.get('http://www.mieliestronk.com/corncob_lowercase.txt');
  // Tolerate CRLF line endings and drop any trailing blank entry.
  StaticDictionary = response.data
    .split(/\r?\n/)
    .filter((word) => word.length > 0);
}
/**
 * Propose likely alternate spellings for each word of a search query.
 *
 * Splits the query on single spaces and collects, per word, the
 * candidate corrections suggested by MostProbableMissSpelling.
 * Currently only logs the per-word candidates; sentence-level ranking
 * is not implemented yet.
 *
 * @param {string} query - raw user search query.
 */
function MostProbableAlternateQueries(query) {
  const reconstruction = query
    .split(' ')
    .map((word) => [...MostProbableMissSpelling(word)]);
  console.log(reconstruction)
  // work out a bit of context to determine the most likely sentence
}
/**
 * Rank the most probable intended spellings of a single word.
 *
 * Stub: candidate generation is not implemented yet, so the list of
 * suggestions is always empty.
 *
 * @param {string} word - the (possibly misspelled) input word.
 * @returns {string[]} candidate corrections, best first (currently none).
 */
function MostProbableMissSpelling(word) {
  const candidates = [];
  return candidates;
}
/**
 * Build the character trigrams of a word.
 *
 * The original stub returned undefined; this yields every consecutive
 * 3-character substring, the n-gram unit the spell-correction model
 * described in docs/QUERY.md is built on. Words shorter than 3
 * characters produce no trigrams.
 *
 * @param {string} word - input word.
 * @returns {string[]} consecutive 3-character substrings, in order.
 */
function TriGrams(word) {
  const grams = [];
  for (let i = 0; i + 3 <= word.length; i++) {
    grams.push(word.slice(i, i + 3));
  }
  return grams;
}
/**
 * Build the character bigrams of a word.
 *
 * The original stub returned undefined; this yields every consecutive
 * 2-character substring. Words shorter than 2 characters produce no
 * bigrams.
 *
 * @param {string} word - input word.
 * @returns {string[]} consecutive 2-character substrings, in order.
 */
function BiGrams(word) {
  const grams = [];
  for (let i = 0; i + 2 <= word.length; i++) {
    grams.push(word.slice(i, i + 2));
  }
  return grams;
}
// Public surface of the n-gram spell-correction controller.
module.exports = {
Init,
MostProbableAlternateQueries,
MostProbableMissSpelling,
};

View File

@@ -5,6 +5,8 @@ const API = require('./routes/api.js');
const Database = require('./database/database.js');
const ngrams = require('./controllers/n-grams.js');
async function main() {
Config.Load();
@@ -21,6 +23,12 @@ async function main() {
Server.Listen(process.env.PORT);
API.Init();
await ngrams.Init();
ngrams.MostProbableAlternateQueries('brick 2x10x4');
ngrams.MostProbableAlternateQueries('lego star wars battlefront');
ngrams.MostProbableAlternateQueries('lego stor was s');
}
main();