I am trying to implement an n-Gram/2L-approximation index from a paper by Min-Soo Kim, Kyu-Young Whang and Jae-Gil Lee, found here: http://infolab.dgist.ac.kr/~mskim/papers/CSSE07.pdf
Building the index is quite straightforward. Where I am getting lost is the querying algorithm, specifically the merge outer join step.
From the paper: "The algorithm extracts n-grams from the query string Q by the 1-sliding technique and searches the posting lists of those n-grams in the front-end index. Then, the algorithm performs merge outer join among those posting lists using the m-subsequence identifier as the join attribute and finds the set {Si} of candidate m-subsequences that satisfy the necessary condition in Theorem 1."
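If I understand the 1-sliding technique correctly, it simply means taking every window of length n, shifted one character at a time (my own toy example, not from the paper):

# my understanding of 1-sliding extraction: every length-n window, step 1
query = "abcde"
n = 3
ngrams = [query[i:i + n] for i in range(len(query) - n + 1)]
# -> ['abc', 'bcd', 'cde']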
So far, I have this piece of code:
from math import ceil, floor


class NGramIndex:
    def __init__(self, m: int, n: int):
        self.m: int = m  # m-subsequence length
        self.n: int = n  # n-gram length
        self.backend_index = dict()   # m-subsequence -> [(doc_id, [offsets])]
        self.frontend_index = dict()  # n-gram -> [(subseq_id, [ngram_offsets])]
        self.msubseq_set = []         # set of m-subsequences; list position = subseq_id

    def append(self, doc: str, doc_id: int):
        N = len(doc)
        max_range = ceil(N / self.m)
        for i in range(0, max_range):
            offset = i * self.m
            msubseq = doc[offset: offset + self.m]
            # if the extracted subsequence is shorter than m, pad it with an extra char
            if len(msubseq) < self.m:
                msubseq += '$' * (self.m - len(msubseq))
            # back-end index: m-subsequence -> documents and offsets within them
            if msubseq not in self.backend_index:
                self.backend_index[msubseq] = [(doc_id, [offset])]
            elif self.backend_index[msubseq][-1][0] == doc_id:
                self.backend_index[msubseq][-1][1].append(offset)
            else:
                self.backend_index[msubseq].append((doc_id, [offset]))
            if msubseq in self.msubseq_set:
                # subseq_id is the unique identifier in msubseq_set
                subseq_id = self.msubseq_set.index(msubseq)
            else:
                self.msubseq_set.append(msubseq)
                subseq_id = len(self.msubseq_set) - 1
            # front-end index: n-gram -> m-subsequences and offsets within them
            max_q_range = self.m - self.n + 1
            for ngram_offset in range(0, max_q_range):
                ngram = msubseq[ngram_offset:ngram_offset + self.n]
                if ngram not in self.frontend_index:
                    self.frontend_index[ngram] = [(subseq_id, [ngram_offset])]
                elif self.frontend_index[ngram][-1][0] == subseq_id:
                    self.frontend_index[ngram][-1][1].append(ngram_offset)
                else:
                    self.frontend_index[ngram].append((subseq_id, [ngram_offset]))

    def query(self, query_word: str, k: int):
        """
        Query the index for results.
        k = error tolerance (threshold)
        """
        t = floor((len(query_word) + 1) / self.m) - 1
        eps = floor(k / t)  # assumes the query is long enough that t >= 1
        # r is used for filtration later
        r = (self.m - self.n + 1) - (eps * self.n)
        # extract the query's n-grams by 1-sliding and collect their posting lists
        postings = []
        for i in range(0, len(query_word) - self.n + 1):
            ngram = query_word[i: i + self.n]
            if ngram in self.frontend_index:
                postings.append(self.frontend_index[ngram])
        # TODO: Perform merge outer join?
This is where I get stuck: I do not know how to actually perform the merge outer join over the collected posting lists.
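For what it's worth, here is my rough guess at what the join might look like (merge_outer_join and the gram_index tagging are my own sketch, not from the paper). Since subsequence ids are handed out in increasing order in append, each posting list should already be ordered by subseq_id, so I imagine a k-way merge that groups entries by subseq_id and keeps a group even when only some of the query's n-grams have an entry for that id (which I take to be the "outer" part):

import heapq
from collections import defaultdict

def merge_outer_join(postings):
    """
    postings: one posting list per query n-gram, each a list of
    (subseq_id, [ngram_offsets]) pairs as stored in frontend_index.
    Returns {subseq_id: {gram_index: [ngram_offsets]}}; a gram_index that is
    missing for a given subseq_id means that n-gram has no entry there.
    """
    # tag every entry with the index of the query n-gram it came from,
    # so the origin of each posting survives the merge
    tagged = [
        [(subseq_id, gram_index, offsets) for subseq_id, offsets in plist]
        for gram_index, plist in enumerate(postings)
    ]
    joined = defaultdict(dict)
    # k-way merge of the (already ordered) posting lists on subseq_id
    for subseq_id, gram_index, offsets in heapq.merge(*tagged):
        joined[subseq_id][gram_index] = offsets
    return joined

I would then, I assume, keep only the subseq_ids whose groups satisfy the condition from Theorem 1 (presumably using the r computed in query), but whether this is what the paper means by merge outer join is exactly what I cannot tell.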
I would be glad for any suggestions, references, anything.