dsa

Longest Common Subsequence (LCS): Mastering Two-Sequence DP Patterns

#Dynamic Programming#Algorithms#Go#LCS#String Comparison#Bioinformatics

Longest Common Subsequence (LCS): Mastering Two-Sequence DP Patterns

The Longest Common Subsequence (LCS) problem is fundamental to sequence comparison and forms the backbone of many real-world applications including diff tools, version control systems, and bioinformatics sequence alignment.

Table of Contents

  1. LCS Problem Definition
  2. Basic 2D DP Solution
  3. Space Optimization Techniques
  4. LCS Variations and Extensions
  5. Advanced Multi-Sequence LCS
  6. Real-World Applications
  7. Problem Recognition

LCS Problem Definition {#lcs-definition}

Given two sequences, find the length of the longest subsequence that appears in both sequences in the same relative order.

Key Concepts

graph TD A[LCS Concepts] --> B[Subsequence Properties] A --> C[Comparison Rules] A --> D[DP Structure] B --> B1[Non-contiguous OK] B --> B2[Preserve relative order] B --> B3[Can skip elements] C --> C1[Character match: extend LCS] C --> C2[No match: take best from either side] C --> C3[Empty sequence: LCS = 0] D --> D1[2D table dp[i][j]] D --> D2[Represents prefixes] D --> D3[Bottom-up construction] style C1 fill:#e8f5e8

Example Walkthrough

// Demonstrate LCS with step-by-step trace
func demonstrateLCS() {
    text1 := "ABCDGH"
    text2 := "AEDFHR"
    
    fmt.Printf("Text1: %s\n", text1)
    fmt.Printf("Text2: %s\n", text2)
    
    // Expected LCS: "ADH" (length 3)
    length := longestCommonSubsequence(text1, text2)
    fmt.Printf("LCS Length: %d\n", length)
    
    sequence := getLCS(text1, text2)
    fmt.Printf("LCS Sequence: %s\n", sequence)
    
    // Show all possible LCS of this length
    allLCS := getAllLCS(text1, text2)
    fmt.Printf("All LCS: %v\n", allLCS)
}

Basic 2D DP Solution {#basic-lcs-solution}

Classic LCS Algorithm

// Classic LCS between two strings - O(mn) time and space
func longestCommonSubsequence(text1, text2 string) int {
    m, n := len(text1), len(text2)
    
    // dp[i][j] = LCS length for text1[0:i] and text2[0:j]
    dp := make([][]int, m+1)
    for i := range dp {
        dp[i] = make([]int, n+1)
    }
    
    // Fill the DP table
    for i := 1; i <= m; i++ {
        for j := 1; j <= n; j++ {
            if text1[i-1] == text2[j-1] {
                // Characters match: extend the LCS
                dp[i][j] = dp[i-1][j-1] + 1
            } else {
                // No match: take the better of two options
                dp[i][j] = max(dp[i-1][j], dp[i][j-1])
            }
        }
    }
    
    return dp[m][n]
}

func max(a, b int) int {
    if a > b {
        return a
    }
    return b
}

LCS with DP Table Visualization

// Visualize the DP table construction
func visualizeLCSDP(text1, text2 string) {
    m, n := len(text1), len(text2)
    dp := make([][]int, m+1)
    for i := range dp {
        dp[i] = make([]int, n+1)
    }
    
    fmt.Printf("LCS DP Table Construction:\n")
    fmt.Printf("Text1: %s\n", text1)
    fmt.Printf("Text2: %s\n", text2)
    fmt.Println()
    
    // Print header
    fmt.Printf("    ε ")
    for j := 0; j < n; j++ {
        fmt.Printf(" %c ", text2[j])
    }
    fmt.Println()
    
    // Fill and print table row by row
    for i := 0; i <= m; i++ {
        if i == 0 {
            fmt.Printf("ε ")
        } else {
            fmt.Printf("%c ", text1[i-1])
        }
        
        for j := 0; j <= n; j++ {
            if i == 0 || j == 0 {
                dp[i][j] = 0
            } else if text1[i-1] == text2[j-1] {
                dp[i][j] = dp[i-1][j-1] + 1
            } else {
                dp[i][j] = max(dp[i-1][j], dp[i][j-1])
            }
            
            fmt.Printf(" %d ", dp[i][j])
        }
        fmt.Println()
    }
}

Reconstructing the LCS String

// Get the actual LCS string, not just the length
func getLCS(text1, text2 string) string {
    m, n := len(text1), len(text2)
    dp := make([][]int, m+1)
    for i := range dp {
        dp[i] = make([]int, n+1)
    }
    
    // Fill DP table
    for i := 1; i <= m; i++ {
        for j := 1; j <= n; j++ {
            if text1[i-1] == text2[j-1] {
                dp[i][j] = dp[i-1][j-1] + 1
            } else {
                dp[i][j] = max(dp[i-1][j], dp[i][j-1])
            }
        }
    }
    
    // Backtrack to build LCS
    lcs := make([]byte, dp[m][n])
    i, j, k := m, n, dp[m][n]-1
    
    for i > 0 && j > 0 {
        if text1[i-1] == text2[j-1] {
            // This character is part of LCS
            lcs[k] = text1[i-1]
            k--
            i--
            j--
        } else if dp[i-1][j] > dp[i][j-1] {
            // Move up (text1 contributes more)
            i--
        } else {
            // Move left (text2 contributes more)
            j--
        }
    }
    
    return string(lcs)
}

Finding All Possible LCS

// Find all possible LCS strings (there can be multiple)
func getAllLCS(text1, text2 string) []string {
    m, n := len(text1), len(text2)
    dp := make([][]int, m+1)
    for i := range dp {
        dp[i] = make([]int, n+1)
    }
    
    // Fill DP table
    for i := 1; i <= m; i++ {
        for j := 1; j <= n; j++ {
            if text1[i-1] == text2[j-1] {
                dp[i][j] = dp[i-1][j-1] + 1
            } else {
                dp[i][j] = max(dp[i-1][j], dp[i][j-1])
            }
        }
    }
    
    var allLCS []string
    backtrackAllLCS(text1, text2, dp, m, n, "", &allLCS)
    
    return removeDuplicates(allLCS)
}

func backtrackAllLCS(text1, text2 string, dp [][]int, i, j int, current string, result *[]string) {
    if i == 0 || j == 0 {
        // Reverse the string since we built it backwards
        *result = append(*result, reverse(current))
        return
    }
    
    if text1[i-1] == text2[j-1] {
        // This character must be in the LCS
        backtrackAllLCS(text1, text2, dp, i-1, j-1, current+string(text1[i-1]), result)
    } else {
        // Explore both directions where LCS length is maximum
        if dp[i-1][j] == dp[i][j] {
            backtrackAllLCS(text1, text2, dp, i-1, j, current, result)
        }
        if dp[i][j-1] == dp[i][j] {
            backtrackAllLCS(text1, text2, dp, i, j-1, current, result)
        }
    }
}

func reverse(s string) string {
    runes := []rune(s)
    for i, j := 0, len(runes)-1; i < j; i, j = i+1, j-1 {
        runes[i], runes[j] = runes[j], runes[i]
    }
    return string(runes)
}

func removeDuplicates(strings []string) []string {
    seen := make(map[string]bool)
    var result []string
    
    for _, s := range strings {
        if !seen[s] {
            seen[s] = true
            result = append(result, s)
        }
    }
    
    return result
}

Space Optimization Techniques {#space-optimization}

Rolling Array Optimization

// Space-optimized LCS using only two rows - O(min(m,n)) space
func longestCommonSubsequenceOptimized(text1, text2 string) int {
    m, n := len(text1), len(text2)
    
    // Ensure text1 is the shorter string for better space complexity
    if m > n {
        return longestCommonSubsequenceOptimized(text2, text1)
    }
    
    // Use only two rows instead of full m×n table
    prev := make([]int, m+1)
    curr := make([]int, m+1)
    
    for j := 1; j <= n; j++ {
        for i := 1; i <= m; i++ {
            if text1[i-1] == text2[j-1] {
                curr[i] = prev[i-1] + 1
            } else {
                curr[i] = max(prev[i], curr[i-1])
            }
        }
        prev, curr = curr, prev
    }
    
    return prev[m]
}

Space-Optimized with Path Reconstruction

// Hirschberg's algorithm for space-efficient LCS with path reconstruction
func lcsHirschberg(text1, text2 string) string {
    m, n := len(text1), len(text2)
    
    if m == 0 || n == 0 {
        return ""
    }
    
    if m == 1 {
        // Base case: single character in text1
        if strings.Contains(text2, text1) {
            return text1
        }
        return ""
    }
    
    // Divide: find the middle row
    mid := m / 2
    
    // Compute LCS lengths from left and right
    left := lcsLengthForward(text1[:mid], text2)
    right := lcsLengthBackward(text1[mid:], text2)
    
    // Find the optimal split point
    maxSum := -1
    splitJ := 0
    
    for j := 0; j <= n; j++ {
        if left[j]+right[n-j] > maxSum {
            maxSum = left[j] + right[n-j]
            splitJ = j
        }
    }
    
    // Conquer: recursively solve subproblems
    leftLCS := lcsHirschberg(text1[:mid], text2[:splitJ])
    rightLCS := lcsHirschberg(text1[mid:], text2[splitJ:])
    
    return leftLCS + rightLCS
}

func lcsLengthForward(text1, text2 string) []int {
    m, n := len(text1), len(text2)
    dp := make([]int, n+1)
    
    for i := 1; i <= m; i++ {
        prev := 0
        for j := 1; j <= n; j++ {
            temp := dp[j]
            if text1[i-1] == text2[j-1] {
                dp[j] = prev + 1
            } else {
                dp[j] = max(dp[j], dp[j-1])
            }
            prev = temp
        }
    }
    
    return dp
}

func lcsLengthBackward(text1, text2 string) []int {
    m, n := len(text1), len(text2)
    dp := make([]int, n+1)
    
    for i := m - 1; i >= 0; i-- {
        prev := 0
        for j := n - 1; j >= 0; j-- {
            temp := dp[j]
            if text1[i] == text2[j] {
                dp[j] = prev + 1
            } else {
                dp[j] = max(dp[j], dp[j+1])
            }
            prev = temp
        }
    }
    
    return dp
}

LCS Variations and Extensions {#lcs-variations}

Longest Common Substring (Contiguous)

// Longest Common Substring - must be contiguous
func longestCommonSubstring(text1, text2 string) string {
    m, n := len(text1), len(text2)
    
    // dp[i][j] = length of common substring ending at i-1, j-1
    dp := make([][]int, m+1)
    for i := range dp {
        dp[i] = make([]int, n+1)
    }
    
    maxLength := 0
    endingI := 0
    
    for i := 1; i <= m; i++ {
        for j := 1; j <= n; j++ {
            if text1[i-1] == text2[j-1] {
                dp[i][j] = dp[i-1][j-1] + 1
                if dp[i][j] > maxLength {
                    maxLength = dp[i][j]
                    endingI = i
                }
            } else {
                dp[i][j] = 0  // Reset for contiguous requirement
            }
        }
    }
    
    if maxLength == 0 {
        return ""
    }
    
    return text1[endingI-maxLength : endingI]
}

Shortest Common Supersequence

// Find the shortest string that contains both input strings as subsequences
func shortestCommonSupersequence(str1, str2 string) string {
    m, n := len(str1), len(str2)
    
    // First find LCS using DP
    dp := make([][]int, m+1)
    for i := range dp {
        dp[i] = make([]int, n+1)
    }
    
    for i := 1; i <= m; i++ {
        for j := 1; j <= n; j++ {
            if str1[i-1] == str2[j-1] {
                dp[i][j] = dp[i-1][j-1] + 1
            } else {
                dp[i][j] = max(dp[i-1][j], dp[i][j-1])
            }
        }
    }
    
    // Build supersequence by backtracking
    var result []byte
    i, j := m, n
    
    for i > 0 && j > 0 {
        if str1[i-1] == str2[j-1] {
            // Common character: add once
            result = append([]byte{str1[i-1]}, result...)
            i--
            j--
        } else if dp[i-1][j] > dp[i][j-1] {
            // str1[i-1] not in LCS: add from str1
            result = append([]byte{str1[i-1]}, result...)
            i--
        } else {
            // str2[j-1] not in LCS: add from str2
            result = append([]byte{str2[j-1]}, result...)
            j--
        }
    }
    
    // Add remaining characters
    for i > 0 {
        result = append([]byte{str1[i-1]}, result...)
        i--
    }
    for j > 0 {
        result = append([]byte{str2[j-1]}, result...)
        j--
    }
    
    return string(result)
}

LCS with Character Costs

// LCS where different character matches have different values
func lcsWithCosts(text1, text2 string, matchCosts map[byte]int) int {
    m, n := len(text1), len(text2)
    
    // dp[i][j] = maximum cost LCS for text1[0:i] and text2[0:j]
    dp := make([][]int, m+1)
    for i := range dp {
        dp[i] = make([]int, n+1)
    }
    
    for i := 1; i <= m; i++ {
        for j := 1; j <= n; j++ {
            if text1[i-1] == text2[j-1] {
                // Characters match: add the cost of this character
                cost := matchCosts[text1[i-1]]
                dp[i][j] = dp[i-1][j-1] + cost
            } else {
                // No match: take the better option
                dp[i][j] = max(dp[i-1][j], dp[i][j-1])
            }
        }
    }
    
    return dp[m][n]
}

Advanced Multi-Sequence LCS {#multi-sequence-lcs}

LCS of Three Strings

// LCS of three strings using 3D DP
func longestCommonSubsequence3(text1, text2, text3 string) int {
    m, n, p := len(text1), len(text2), len(text3)
    
    // 3D DP: dp[i][j][k] = LCS for text1[0:i], text2[0:j], text3[0:k]
    dp := make([][][]int, m+1)
    for i := range dp {
        dp[i] = make([][]int, n+1)
        for j := range dp[i] {
            dp[i][j] = make([]int, p+1)
        }
    }
    
    for i := 1; i <= m; i++ {
        for j := 1; j <= n; j++ {
            for k := 1; k <= p; k++ {
                if text1[i-1] == text2[j-1] && text2[j-1] == text3[k-1] {
                    // All three characters match
                    dp[i][j][k] = dp[i-1][j-1][k-1] + 1
                } else {
                    // Take the maximum from the seven possible subproblems
                    dp[i][j][k] = max3(
                        dp[i-1][j][k],     // Skip from text1
                        dp[i][j-1][k],     // Skip from text2
                        dp[i][j][k-1],     // Skip from text3
                    )
                }
            }
        }
    }
    
    return dp[m][n][p]
}

func max3(a, b, c int) int {
    return max(max(a, b), c)
}

LCS of Multiple Sequences (General Solution)

// LCS of multiple sequences using recursive approach with memoization
type MultiLCSMemo struct {
    memo map[string]int
    sequences []string
}

func NewMultiLCSMemo(sequences []string) *MultiLCSMemo {
    return &MultiLCSMemo{
        memo: make(map[string]int),
        sequences: sequences,
    }
}

func (mlcs *MultiLCSMemo) LCS() int {
    indices := make([]int, len(mlcs.sequences))
    return mlcs.lcsRecursive(indices)
}

func (mlcs *MultiLCSMemo) lcsRecursive(indices []int) int {
    // Create key for memoization
    key := fmt.Sprintf("%v", indices)
    if result, exists := mlcs.memo[key]; exists {
        return result
    }
    
    // Check if any sequence is exhausted
    for i, idx := range indices {
        if idx >= len(mlcs.sequences[i]) {
            mlcs.memo[key] = 0
            return 0
        }
    }
    
    // Check if all current characters are the same
    char := mlcs.sequences[0][indices[0]]
    allSame := true
    
    for i := 1; i < len(mlcs.sequences); i++ {
        if mlcs.sequences[i][indices[i]] != char {
            allSame = false
            break
        }
    }
    
    if allSame {
        // All characters match: include this character and advance all
        newIndices := make([]int, len(indices))
        for i := range indices {
            newIndices[i] = indices[i] + 1
        }
        result := 1 + mlcs.lcsRecursive(newIndices)
        mlcs.memo[key] = result
        return result
    }
    
    // Characters don't match: try skipping each sequence
    maxResult := 0
    for i := range indices {
        newIndices := make([]int, len(indices))
        copy(newIndices, indices)
        newIndices[i]++
        
        result := mlcs.lcsRecursive(newIndices)
        maxResult = max(maxResult, result)
    }
    
    mlcs.memo[key] = maxResult
    return maxResult
}

Real-World Applications {#real-world-applications}

Text Diff Implementation

// Text diff system using LCS
type TextDiff struct {
    lines1 []string
    lines2 []string
}

type DiffLine struct {
    Type    string // "add", "delete", "equal"
    Content string
    LineNum1 int
    LineNum2 int
}

func NewTextDiff(text1, text2 string) *TextDiff {
    return &TextDiff{
        lines1: strings.Split(text1, "\n"),
        lines2: strings.Split(text2, "\n"),
    }
}

func (td *TextDiff) ComputeDiff() []DiffLine {
    m, n := len(td.lines1), len(td.lines2)
    
    // Compute LCS table for line comparison
    dp := make([][]int, m+1)
    for i := range dp {
        dp[i] = make([]int, n+1)
    }
    
    for i := 1; i <= m; i++ {
        for j := 1; j <= n; j++ {
            if td.lines1[i-1] == td.lines2[j-1] {
                dp[i][j] = dp[i-1][j-1] + 1
            } else {
                dp[i][j] = max(dp[i-1][j], dp[i][j-1])
            }
        }
    }
    
    // Backtrack to generate diff
    var diff []DiffLine
    i, j := m, n
    
    for i > 0 || j > 0 {
        if i > 0 && j > 0 && td.lines1[i-1] == td.lines2[j-1] {
            // Lines are equal
            diff = append([]DiffLine{{
                Type: "equal",
                Content: td.lines1[i-1],
                LineNum1: i,
                LineNum2: j,
            }}, diff...)
            i--
            j--
        } else if i > 0 && (j == 0 || dp[i][j] == dp[i-1][j]) {
            // Line deleted from first file
            diff = append([]DiffLine{{
                Type: "delete",
                Content: td.lines1[i-1],
                LineNum1: i,
                LineNum2: 0,
            }}, diff...)
            i--
        } else {
            // Line added in second file
            diff = append([]DiffLine{{
                Type: "add",
                Content: td.lines2[j-1],
                LineNum1: 0,
                LineNum2: j,
            }}, diff...)
            j--
        }
    }
    
    return diff
}

func (td *TextDiff) GenerateUnifiedDiff() string {
    diff := td.ComputeDiff()
    var result strings.Builder
    
    lineNum1, lineNum2 := 1, 1
    
    for _, line := range diff {
        switch line.Type {
        case "add":
            result.WriteString(fmt.Sprintf("+%d: %s\n", lineNum2, line.Content))
            lineNum2++
        case "delete":
            result.WriteString(fmt.Sprintf("-%d: %s\n", lineNum1, line.Content))
            lineNum1++
        case "equal":
            result.WriteString(fmt.Sprintf(" %d: %s\n", lineNum1, line.Content))
            lineNum1++
            lineNum2++
        }
    }
    
    return result.String()
}

DNA Sequence Alignment

// Bioinformatics sequence alignment using LCS
type DNAAligner struct {
    matchScore    int
    mismatchScore int
}

func NewDNAAligner(match, mismatch int) *DNAAligner {
    return &DNAAligner{
        matchScore: match,
        mismatchScore: mismatch,
    }
}

func (da *DNAAligner) Align(seq1, seq2 string) (int, string, string) {
    m, n := len(seq1), len(seq2)
    
    // DP table for alignment scores
    dp := make([][]int, m+1)
    for i := range dp {
        dp[i] = make([]int, n+1)
    }
    
    // Fill DP table
    for i := 1; i <= m; i++ {
        for j := 1; j <= n; j++ {
            if seq1[i-1] == seq2[j-1] {
                // Nucleotides match
                dp[i][j] = dp[i-1][j-1] + da.matchScore
            } else {
                // No match: either align with mismatch or gap
                align := dp[i-1][j-1] + da.mismatchScore
                gap1 := dp[i-1][j]    // Gap in seq2
                gap2 := dp[i][j-1]    // Gap in seq1
                
                dp[i][j] = max(align, max(gap1, gap2))
            }
        }
    }
    
    // Backtrack to get alignment
    var aligned1, aligned2 strings.Builder
    i, j := m, n
    
    for i > 0 || j > 0 {
        if i > 0 && j > 0 {
            align := dp[i-1][j-1]
            if seq1[i-1] == seq2[j-1] {
                align += da.matchScore
            } else {
                align += da.mismatchScore
            }
            
            if dp[i][j] == align {
                aligned1.WriteByte(seq1[i-1])
                aligned2.WriteByte(seq2[j-1])
                i--
                j--
                continue
            }
        }
        
        if i > 0 && dp[i][j] == dp[i-1][j] {
            // Gap in seq2
            aligned1.WriteByte(seq1[i-1])
            aligned2.WriteByte('-')
            i--
        } else {
            // Gap in seq1
            aligned1.WriteByte('-')
            aligned2.WriteByte(seq2[j-1])
            j--
        }
    }
    
    // Reverse strings (built backwards)
    return dp[m][n], reverseString(aligned1.String()), reverseString(aligned2.String())
}

func reverseString(s string) string {
    runes := []rune(s)
    for i, j := 0, len(runes)-1; i < j; i, j = i+1, j-1 {
        runes[i], runes[j] = runes[j], runes[i]
    }
    return string(runes)
}

Problem Recognition {#problem-recognition}

LCS Pattern Indicators

type LCSPatternMatcher struct {
    patterns map[string][]string
}

func NewLCSPatternMatcher() *LCSPatternMatcher {
    return &LCSPatternMatcher{
        patterns: map[string][]string{
            "basic_lcs": {
                "common subsequence", "two strings", "longest common",
                "sequence comparison", "alignment",
            },
            "diff_tools": {
                "difference", "diff", "compare files", "version control",
                "changes", "patches",
            },
            "bioinformatics": {
                "DNA", "sequence alignment", "genome", "protein",
                "BLAST", "nucleotide", "amino acid",
            },
            "text_processing": {
                "similar documents", "plagiarism", "text similarity",
                "document comparison", "content analysis",
            },
        },
    }
}

func (lpm *LCSPatternMatcher) IdentifyApplication(description string) string {
    desc := strings.ToLower(description)
    maxScore := 0
    bestMatch := "general_lcs"
    
    for pattern, keywords := range lpm.patterns {
        score := 0
        for _, keyword := range keywords {
            if strings.Contains(desc, keyword) {
                score++
            }
        }
        
        if score > maxScore {
            maxScore = score
            bestMatch = pattern
        }
    }
    
    return bestMatch
}

LCS vs Other Sequence Problems

Problem Type Input Output Key Insight
LCS Two sequences Common subsequence Character match extends LCS
LIS One sequence Increasing subsequence Ordering constraint
Edit Distance Two sequences Minimum operations Transformation cost
Substring Two sequences Common substring Contiguity required

Conclusion

Longest Common Subsequence is a fundamental pattern in sequence analysis with wide-ranging applications. Understanding LCS provides the foundation for many advanced string algorithms and real-world text processing tasks.

LCS Mastery Framework:

🔍 Core Algorithm

  • 2D DP approach - Build table bottom-up
  • Space optimization - Use rolling arrays for O(n) space
  • Path reconstruction - Backtrack to find actual sequences

🚀 Advanced Techniques

  • Multi-sequence LCS - Extend to 3+ sequences
  • Hirschberg's algorithm - Space-efficient with path reconstruction
  • Weighted LCS - Different match values

💡 Real-World Impact

  • Diff tools - Git, SVN, file comparison utilities
  • Bioinformatics - DNA/protein sequence alignment
  • Text analysis - Plagiarism detection, similarity scoring

Key Takeaways:

  1. Foundation for many algorithms - Understanding LCS helps with edit distance, diff algorithms
  2. Space-time tradeoffs - Multiple optimization strategies available
  3. Practical applications - Powers many real-world text processing tools
  4. Extensible pattern - Easily adapted for domain-specific needs

🚀 Next Steps: With LCS mastered, you're ready to explore Edit Distance patterns that quantify the cost of transforming one sequence into another!


Next in series: Edit Distance Patterns | Previous: LIS Patterns