Machine Coding Problem

Email Search Engine

macoAllmessaginginverted-indexthreading
Commonly Asked By:GoogleMicrosoftYahoo

In-Scope Features

  • Dynamic Inverted Index Storage: Tokenizes and registers messages under mapping dictionaries instantly on ingestion.
  • TF-IDF & Field Boost Scoring: Computes keyword matches and boosts terms found inside Subject lines by 3x.
  • Thread Grouping Resolvers: Collapses matching emails into unified visual conversation threads.
  • Positional Adjacency (Phrase Search): Maps arrays of word index locations to support double-quoted phrase lookups.

Production reference implementations demonstrating inverted indexes, stopword filters, TF-IDF rating math, and thread folding:

// ─── JAVA BLUEPRINT ──────────────────────────────────────────────────────────
import java.util.*;
import java.util.concurrent.*;
import java.util.stream.Collectors;

class EmailDocument {
    private final String id;
    private final String subject;
    private final String body;
    private final String sender;
    private final String threadId;
    private final long timestamp;
    private final boolean hasAttachment;

    public EmailDocument(String id, String subject, String body, String sender, 
                         String threadId, long timestamp, boolean hasAttachment) {
        this.id = id;
        this.subject = subject;
        this.body = body;
        this.sender = sender;
        this.threadId = threadId;
        this.timestamp = timestamp;
        this.hasAttachment = hasAttachment;
    }

    public String getId() { return id; }
    public String getSubject() { return subject; }
    public String getBody() { return body; }
    public String getSender() { return sender; }
    public String getThreadId() { return threadId; }
    public long getTimestamp() { return timestamp; }
    public boolean isHasAttachment() { return hasAttachment; }
}

class Posting {
    private final String docId;
    private final int termFrequency;
    private final List<Integer> positions;
    private final boolean inSubject;

    public Posting(String docId, int termFrequency, List<Integer> positions, boolean inSubject) {
        this.docId = docId;
        this.termFrequency = termFrequency;
        this.positions = positions;
        this.inSubject = inSubject;
    }

    public String getDocId() { return docId; }
    public int getTermFrequency() { return termFrequency; }
    public List<Integer> getPositions() { return positions; }
    public boolean isInSubject() { return inSubject; }
}

class InvertedIndex {
    private final Map<String, List<Posting>> index = new ConcurrentHashMap<>();
    private final Set<String> stopwords = new HashSet<>(Arrays.asList("the", "a", "an", "and", "is", "to", "in", "of", "for", "with", "we", "i", "will"));

    public List<String> tokenize(String text) {
        if (text == null) return Collections.emptyList();
        return Arrays.stream(text.toLowerCase().replaceAll("[^a-zA-Z0-9\\s]", "").split("\\s+"))
            .filter(word -> !word.isEmpty() && !stopwords.contains(word))
            .collect(Collectors.toList());
    }

    public void addDocument(EmailDocument doc) {
        // Tokenize Subject
        List<String> subjectTokens = tokenize(doc.getSubject());
        for (int i = 0; i < subjectTokens.size(); i++) {
            recordToken(subjectTokens.get(i), doc.getId(), i, true);
        }

        // Tokenize Body
        List<String> bodyTokens = tokenize(doc.getBody());
        for (int i = 0; i < bodyTokens.size(); i++) {
            recordToken(bodyTokens.get(i), doc.getId(), i, false);
        }
    }

    private synchronized void recordToken(String token, String docId, int position, boolean inSubject) {
        List<Posting> postings = index.computeIfAbsent(token, k -> new ArrayList<>());
        
        Posting existing = postings.stream().filter(p -> p.getDocId().equals(docId)).findFirst().orElse(null);
        if (existing != null) {
            existing.getPositions().add(position);
            // Re-wrap frequency count
            postings.remove(existing);
            postings.add(new Posting(docId, existing.getTermFrequency() + 1, existing.getPositions(), existing.isInSubject() || inSubject));
        } else {
            List<Integer> pos = new ArrayList<>();
            pos.add(position);
            postings.add(new Posting(docId, 1, pos, inSubject));
        }
    }

    public List<Posting> getPostings(String token) {
        return index.getOrDefault(token.toLowerCase(), Collections.emptyList());
    }

    public Map<String, List<Posting>> getRawIndex() {
        return Collections.unmodifiableMap(index);
    }
}

class SearchQuery {
    private final String keywordQuery;
    private final String senderFilter;
    private final Boolean attachmentFilter;
    private final boolean groupByThread;

    public SearchQuery(String keywordQuery, String senderFilter, Boolean attachmentFilter, boolean groupByThread) {
        this.keywordQuery = keywordQuery;
        this.senderFilter = senderFilter;
        this.attachmentFilter = attachmentFilter;
        this.groupByThread = groupByThread;
    }

    public String getKeywordQuery() { return keywordQuery; }
    public String getSenderFilter() { return senderFilter; }
    public Boolean getAttachmentFilter() { return attachmentFilter; }
    public boolean isGroupByThread() { return groupByThread; }
}

class SearchResult {
    private final EmailDocument document;
    private final double score;
    private final String matchedFieldsDesc;

    public SearchResult(EmailDocument document, double score, String matchedFieldsDesc) {
        this.document = document;
        this.score = score;
        this.matchedFieldsDesc = matchedFieldsDesc;
    }

    public EmailDocument getDocument() { return document; }
    public double getScore() { return score; }
    public String getMatchedFieldsDesc() { return matchedFieldsDesc; }
}

class QueryPlanner {
    private final InvertedIndex index;
    private final Map<String, EmailDocument> docStore = new ConcurrentHashMap<>();

    public QueryPlanner(InvertedIndex index) {
        this.index = index;
    }

    public void registerDocument(EmailDocument doc) {
        docStore.put(doc.getId(), doc);
        index.addDocument(doc);
    }

    public List<SearchResult> search(SearchQuery query) {
        List<String> terms = index.tokenize(query.getKeywordQuery());
        if (terms.isEmpty()) return Collections.emptyList();

        Map<String, Double> docScores = new HashMap<>();
        Map<String, List<String>> matchedTerms = new HashMap<>();

        for (String term : terms) {
            List<Posting> postings = index.getPostings(term);
            for (Posting p : postings) {
                // Scoring Calculation: TF-IDF with subject boosting (3x weight multiplier)
                double tf = p.getTermFrequency();
                double idf = Math.log(1.0 + (double) docStore.size() / (1.0 + postings.size()));
                double fieldBoost = p.isInSubject() ? 3.0 : 1.0;
                
                double score = tf * idf * fieldBoost;
                docScores.merge(p.getDocId(), score, Double::sum);
                matchedTerms.computeIfAbsent(p.getDocId(), k -> new ArrayList<>()).add(term);
            }
        }

        // Apply metadata filters post-retrieval
        List<SearchResult> results = new ArrayList<>();
        for (Map.Entry<String, Double> entry : docScores.entrySet()) {
            EmailDocument doc = docStore.get(entry.getKey());
            if (doc == null) continue;

            if (query.getSenderFilter() != null && !doc.getSender().equalsIgnoreCase(query.getSenderFilter())) {
                continue;
            }
            if (query.getAttachmentFilter() != null && doc.isHasAttachment() != query.getAttachmentFilter()) {
                continue;
            }

            results.add(new SearchResult(doc, entry.getValue(), matchedTerms.get(doc.getId()).toString()));
        }

        // Sort descending by score
        results.sort((a, b) -> Double.compare(b.getScore(), a.getScore()));

        // Group by thread if requested
        if (query.isGroupByThread()) {
            Map<String, List<SearchResult>> threads = results.stream()
                .collect(Collectors.groupingBy(r -> r.getDocument().getThreadId()));
            
            // Pick highest scoring email in each thread as representative
            List<SearchResult> grouped = new ArrayList<>();
            for (List<SearchResult> threadResults : threads.values()) {
                threadResults.stream().max(Comparator.comparingDouble(SearchResult::getScore)).ifPresent(grouped::add);
            }
            grouped.sort((a, b) -> Double.compare(b.getScore(), a.getScore()));
            return grouped;
        }

        return results;
    }
}

public class Main {
    public static void main(String[] args) {
        InvertedIndex index = new InvertedIndex();
        QueryPlanner planner = new QueryPlanner(index);

        EmailDocument email1 = new EmailDocument("E1", "Project roadmap planning meeting tomorrow", "Let us discuss key search rankings and atomic index updates.", "alice@company.com", "thread_1", 1716300000000L, false);
        EmailDocument email2 = new EmailDocument("E2", "Re: Project roadmap planning meeting tomorrow", "Sounds great! I will attach the index draft file for review.", "bob@company.com", "thread_1", 1716300900000L, true);
        EmailDocument email3 = new EmailDocument("E3", "Index server memory crash incident report", "The inverted index servers encountered an out of memory exception under heavy query traffic.", "devops@company.com", "thread_2", 1716302700000L, false);

        planner.registerDocument(email1);
        planner.registerDocument(email2);
        planner.registerDocument(email3);

        System.out.println("--- Search for 'index' ---");
        SearchQuery query1 = new SearchQuery("index", null, null, false);
        List<SearchResult> results1 = planner.search(query1);
        for (SearchResult res : results1) {
            System.out.println("[" + res.getDocument().getId() + "] Subject: " + res.getDocument().getSubject() + " | Score: " + res.getScore() + " | Matched: " + res.getMatchedFieldsDesc());
        }

        System.out.println("\n--- Search with Group by Thread ---");
        SearchQuery query2 = new SearchQuery("index", null, null, true);
        List<SearchResult> results2 = planner.search(query2);
        for (SearchResult res : results2) {
            System.out.println("[" + res.getDocument().getId() + "] Thread: " + res.getDocument().getThreadId() + " | Subject: " + res.getDocument().getSubject() + " | Score: " + res.getScore());
        }
    }
}