In-Scope Features
- Dynamic Inverted Index Storage: Tokenizes and registers messages under mapping dictionaries instantly on ingestion.
- TF-IDF & Field Boost Scoring: Computes keyword matches and boosts terms found inside
Subjectlines by 3x. - Thread Grouping Resolvers: Collapses matching emails into unified visual conversation threads.
- Positional Adjacency (Phrase Search): Maps arrays of word index locations to support double-quoted phrase lookups.
Production reference implementations demonstrating inverted indexes, stopword filters, TF-IDF rating math, and thread folding:
// ─── JAVA BLUEPRINT ──────────────────────────────────────────────────────────
import java.util.*;
import java.util.concurrent.*;
import java.util.stream.Collectors;
class EmailDocument {
private final String id;
private final String subject;
private final String body;
private final String sender;
private final String threadId;
private final long timestamp;
private final boolean hasAttachment;
public EmailDocument(String id, String subject, String body, String sender,
String threadId, long timestamp, boolean hasAttachment) {
this.id = id;
this.subject = subject;
this.body = body;
this.sender = sender;
this.threadId = threadId;
this.timestamp = timestamp;
this.hasAttachment = hasAttachment;
}
public String getId() { return id; }
public String getSubject() { return subject; }
public String getBody() { return body; }
public String getSender() { return sender; }
public String getThreadId() { return threadId; }
public long getTimestamp() { return timestamp; }
public boolean isHasAttachment() { return hasAttachment; }
}
class Posting {
private final String docId;
private final int termFrequency;
private final List<Integer> positions;
private final boolean inSubject;
public Posting(String docId, int termFrequency, List<Integer> positions, boolean inSubject) {
this.docId = docId;
this.termFrequency = termFrequency;
this.positions = positions;
this.inSubject = inSubject;
}
public String getDocId() { return docId; }
public int getTermFrequency() { return termFrequency; }
public List<Integer> getPositions() { return positions; }
public boolean isInSubject() { return inSubject; }
}
class InvertedIndex {
private final Map<String, List<Posting>> index = new ConcurrentHashMap<>();
private final Set<String> stopwords = new HashSet<>(Arrays.asList("the", "a", "an", "and", "is", "to", "in", "of", "for", "with", "we", "i", "will"));
public List<String> tokenize(String text) {
if (text == null) return Collections.emptyList();
return Arrays.stream(text.toLowerCase().replaceAll("[^a-zA-Z0-9\\s]", "").split("\\s+"))
.filter(word -> !word.isEmpty() && !stopwords.contains(word))
.collect(Collectors.toList());
}
public void addDocument(EmailDocument doc) {
// Tokenize Subject
List<String> subjectTokens = tokenize(doc.getSubject());
for (int i = 0; i < subjectTokens.size(); i++) {
recordToken(subjectTokens.get(i), doc.getId(), i, true);
}
// Tokenize Body
List<String> bodyTokens = tokenize(doc.getBody());
for (int i = 0; i < bodyTokens.size(); i++) {
recordToken(bodyTokens.get(i), doc.getId(), i, false);
}
}
private synchronized void recordToken(String token, String docId, int position, boolean inSubject) {
List<Posting> postings = index.computeIfAbsent(token, k -> new ArrayList<>());
Posting existing = postings.stream().filter(p -> p.getDocId().equals(docId)).findFirst().orElse(null);
if (existing != null) {
existing.getPositions().add(position);
// Re-wrap frequency count
postings.remove(existing);
postings.add(new Posting(docId, existing.getTermFrequency() + 1, existing.getPositions(), existing.isInSubject() || inSubject));
} else {
List<Integer> pos = new ArrayList<>();
pos.add(position);
postings.add(new Posting(docId, 1, pos, inSubject));
}
}
public List<Posting> getPostings(String token) {
return index.getOrDefault(token.toLowerCase(), Collections.emptyList());
}
public Map<String, List<Posting>> getRawIndex() {
return Collections.unmodifiableMap(index);
}
}
class SearchQuery {
private final String keywordQuery;
private final String senderFilter;
private final Boolean attachmentFilter;
private final boolean groupByThread;
public SearchQuery(String keywordQuery, String senderFilter, Boolean attachmentFilter, boolean groupByThread) {
this.keywordQuery = keywordQuery;
this.senderFilter = senderFilter;
this.attachmentFilter = attachmentFilter;
this.groupByThread = groupByThread;
}
public String getKeywordQuery() { return keywordQuery; }
public String getSenderFilter() { return senderFilter; }
public Boolean getAttachmentFilter() { return attachmentFilter; }
public boolean isGroupByThread() { return groupByThread; }
}
class SearchResult {
private final EmailDocument document;
private final double score;
private final String matchedFieldsDesc;
public SearchResult(EmailDocument document, double score, String matchedFieldsDesc) {
this.document = document;
this.score = score;
this.matchedFieldsDesc = matchedFieldsDesc;
}
public EmailDocument getDocument() { return document; }
public double getScore() { return score; }
public String getMatchedFieldsDesc() { return matchedFieldsDesc; }
}
class QueryPlanner {
private final InvertedIndex index;
private final Map<String, EmailDocument> docStore = new ConcurrentHashMap<>();
public QueryPlanner(InvertedIndex index) {
this.index = index;
}
public void registerDocument(EmailDocument doc) {
docStore.put(doc.getId(), doc);
index.addDocument(doc);
}
public List<SearchResult> search(SearchQuery query) {
List<String> terms = index.tokenize(query.getKeywordQuery());
if (terms.isEmpty()) return Collections.emptyList();
Map<String, Double> docScores = new HashMap<>();
Map<String, List<String>> matchedTerms = new HashMap<>();
for (String term : terms) {
List<Posting> postings = index.getPostings(term);
for (Posting p : postings) {
// Scoring Calculation: TF-IDF with subject boosting (3x weight multiplier)
double tf = p.getTermFrequency();
double idf = Math.log(1.0 + (double) docStore.size() / (1.0 + postings.size()));
double fieldBoost = p.isInSubject() ? 3.0 : 1.0;
double score = tf * idf * fieldBoost;
docScores.merge(p.getDocId(), score, Double::sum);
matchedTerms.computeIfAbsent(p.getDocId(), k -> new ArrayList<>()).add(term);
}
}
// Apply metadata filters post-retrieval
List<SearchResult> results = new ArrayList<>();
for (Map.Entry<String, Double> entry : docScores.entrySet()) {
EmailDocument doc = docStore.get(entry.getKey());
if (doc == null) continue;
if (query.getSenderFilter() != null && !doc.getSender().equalsIgnoreCase(query.getSenderFilter())) {
continue;
}
if (query.getAttachmentFilter() != null && doc.isHasAttachment() != query.getAttachmentFilter()) {
continue;
}
results.add(new SearchResult(doc, entry.getValue(), matchedTerms.get(doc.getId()).toString()));
}
// Sort descending by score
results.sort((a, b) -> Double.compare(b.getScore(), a.getScore()));
// Group by thread if requested
if (query.isGroupByThread()) {
Map<String, List<SearchResult>> threads = results.stream()
.collect(Collectors.groupingBy(r -> r.getDocument().getThreadId()));
// Pick highest scoring email in each thread as representative
List<SearchResult> grouped = new ArrayList<>();
for (List<SearchResult> threadResults : threads.values()) {
threadResults.stream().max(Comparator.comparingDouble(SearchResult::getScore)).ifPresent(grouped::add);
}
grouped.sort((a, b) -> Double.compare(b.getScore(), a.getScore()));
return grouped;
}
return results;
}
}
public class Main {
public static void main(String[] args) {
InvertedIndex index = new InvertedIndex();
QueryPlanner planner = new QueryPlanner(index);
EmailDocument email1 = new EmailDocument("E1", "Project roadmap planning meeting tomorrow", "Let us discuss key search rankings and atomic index updates.", "alice@company.com", "thread_1", 1716300000000L, false);
EmailDocument email2 = new EmailDocument("E2", "Re: Project roadmap planning meeting tomorrow", "Sounds great! I will attach the index draft file for review.", "bob@company.com", "thread_1", 1716300900000L, true);
EmailDocument email3 = new EmailDocument("E3", "Index server memory crash incident report", "The inverted index servers encountered an out of memory exception under heavy query traffic.", "devops@company.com", "thread_2", 1716302700000L, false);
planner.registerDocument(email1);
planner.registerDocument(email2);
planner.registerDocument(email3);
System.out.println("--- Search for 'index' ---");
SearchQuery query1 = new SearchQuery("index", null, null, false);
List<SearchResult> results1 = planner.search(query1);
for (SearchResult res : results1) {
System.out.println("[" + res.getDocument().getId() + "] Subject: " + res.getDocument().getSubject() + " | Score: " + res.getScore() + " | Matched: " + res.getMatchedFieldsDesc());
}
System.out.println("\n--- Search with Group by Thread ---");
SearchQuery query2 = new SearchQuery("index", null, null, true);
List<SearchResult> results2 = planner.search(query2);
for (SearchResult res : results2) {
System.out.println("[" + res.getDocument().getId() + "] Thread: " + res.getDocument().getThreadId() + " | Subject: " + res.getDocument().getSubject() + " | Score: " + res.getScore());
}
}
}