Fuzzy Searching
Achieving Better Answers with Fuzzy Searching in PostgreSQL
1. Background
Modern users are no longer satisfied with exact-match searches (WHERE name = 'Sammi').
They are used to a Google-like search experience:
- Tolerance for typos.
- Useful results even when the query is slightly wrong.
- Suggestions such as "Did you mean …?".
To achieve this in PostgreSQL, we use fuzzy searching with the pg_trgm extension.
2. What is pg_trgm?
pg_trgm (PostgreSQL Trigram) is an extension for fuzzy string matching.
It splits a string into trigrams: substrings of 3 consecutive characters.
Similarity is then computed from the number of trigrams two strings have in common.
PostgreSQL provides dedicated operators for fuzzy search:
- <-> : distance between two strings (0..1, smaller means more similar)
- <<-> : like <->, but based on word similarity
- % : boolean, true if the strings are similar above a configurable threshold
- LIKE / ILIKE : fully supported by trigram indexes
- ~ : regex matching (can also be accelerated with a trigram index)
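To get a feel for these operators before touching a table, you can try them on string literals; the snippet below is a small sketch (the exact values it returns are not reproduced here, since they depend on pg_trgm's padding and lower-casing rules):
-- Distance: 1 - similarity(), so smaller means more similar
SELECT 'Gramatneusiedl' <-> 'Kramertneusiedel' AS distance;
-- Boolean test against pg_trgm.similarity_threshold
SELECT 'Gramatneusiedl' % 'Kramertneusiedel' AS is_similar;
-- Word similarity: how well the first string matches some part of the second
SELECT word_similarity('neusi', 'Gramatneusiedl') AS word_sim;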
3. Installation & Setup
Enable the extension:
CREATE EXTENSION pg_trgm;
Sample data:
CREATE TABLE t_location (name text);
COPY t_location FROM PROGRAM
'curl https://www.cybertec-postgresql.com/secret/orte.txt';
4. How Trigrams Work
Example string: "abcdef"
The trigrams it produces:
SELECT show_trgm('abcdef');
→ {"  a"," ab",abc,bcd,cde,def,"ef "}
So the string is broken into 3-character pieces (padded with spaces at the start and end). The similarity of two strings is then computed from the number of overlapping trigrams.
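To see this concretely, show_trgm() and similarity() can be compared side by side; a minimal sketch using the place names from the next section (the resulting score is left for you to run):
-- Compare the trigram sets of a correct name and a typo
SELECT show_trgm('Gramatneusiedl') AS trgm_correct,
       show_trgm('Kramertneusiedel') AS trgm_typo;
-- similarity() returns a value between 0 and 1 derived from the shared trigrams
SELECT similarity('Gramatneusiedl', 'Kramertneusiedel') AS sim;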
5. Fuzzy Search Example
Suppose a user mistypes "Kramertneusiedel" when the correct name is "Gramatneusiedl":
SELECT *
FROM t_location
ORDER BY name <-> 'Kramertneusiedel'
LIMIT 3;
Result:
Gramatneusiedl
Klein-Neusiedl
Potzneusiedl
→ PostgreSQL automatically returns the closest matches instead of requiring an exact match.
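To see why the rows come back in this order, the same query can expose the distance values explicitly (a small sketch; the actual numbers depend on your dataset):
SELECT name,
       name <-> 'Kramertneusiedel' AS distance  -- smaller = more similar
FROM t_location
ORDER BY name <-> 'Kramertneusiedel'
LIMIT 3;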
π 6. Indexing untuk Performa
Tanpa index β pencarian fuzzy butuh full scan β lambat di dataset besar. Solusinya: buat index khusus trigram.
GiST index:
CREATE INDEX idx_trgm_gist ON t_location USING gist (name gist_trgm_ops);
GIN index (usually more efficient for large datasets):
CREATE INDEX idx_trgm_gin ON t_location USING gin (name gin_trgm_ops);
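One practical difference between the two: only the GiST index can accelerate the ORDER BY name <-> '…' LIMIT n pattern from the previous section (a nearest-neighbor search), while GIN is generally the better choice for LIKE, ILIKE, regex, and % filters. A quick way to verify which index your query uses:
-- With the GiST index in place, the plan should use it for the nearest-neighbor ordering
EXPLAIN
SELECT *
FROM t_location
ORDER BY name <-> 'Kramertneusiedel'
LIMIT 3;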
7. Fuzzy Search vs LIKE
A plain LIKE with a leading wildcard is normally very slow:
SELECT * FROM t_location WHERE name LIKE '%neusi%';
→ Without a suitable index this usually causes a sequential scan.
But with a trigram index in place:
EXPLAIN SELECT *
FROM t_location
WHERE name LIKE '%neusi%';
The query plan will show a Bitmap Index Scan, which is much faster.
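ILIKE benefits in exactly the same way, which matters for case-insensitive search boxes; a small sketch against the same table and trigram index:
EXPLAIN SELECT *
FROM t_location
WHERE name ILIKE '%NEUSI%';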
8. Fuzzy Search with Regex
Even regular expressions can be accelerated by a trigram index:
SELECT *
FROM t_location
WHERE name ~ '[A-C].*neu.*';
Query plan (abridged):
Bitmap Heap Scan on t_location
  Recheck Cond: (name ~ '[A-C].*neu.*'::text)
  -> Bitmap Index Scan on idx_trgm_gin
       Index Cond: (name ~ '[A-C].*neu.*'::text)
→ PostgreSQL understands the regex pattern and uses the trigram index to speed it up.
9. Threshold & Fine-Tuning
We can tune the similarity threshold (0..1) used by the % operator.
-- Check the current threshold (the default is 0.3)
SHOW pg_trgm.similarity_threshold;
-- Lower the threshold to make matching more permissive
SET pg_trgm.similarity_threshold = 0.2;
-- Find similar strings
SELECT *
FROM t_location
WHERE name % 'neusid';
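To choose a sensible threshold, it helps to look at the raw similarity scores first; and since SET only lasts for the current session, a durable default is typically set per database. A sketch (mydb is a placeholder database name):
-- Inspect raw similarity scores before picking a threshold
SELECT name, similarity(name, 'neusid') AS sim
FROM t_location
ORDER BY sim DESC
LIMIT 5;
-- Persist the threshold for new sessions in this database (mydb is hypothetical)
ALTER DATABASE mydb SET pg_trgm.similarity_threshold = 0.2;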
10. Real-World Use Cases
- Application search bars: a user types "Jakrta" and still gets "Jakarta".
- Autocomplete: typing "Ban" suggests "Bandung", "Banjarmasin".
- "Did you mean …?": typing "gramatneusid" recommends "Gramatneusiedl" (see the sketch after this list).
- Fuzzy matching of duplicates: cleaning datasets of addresses, user names, or business entities.
- Speeding up LIKE / ILIKE: ILIKE '%keyword%' queries on large tables can become very fast with a trigram index.
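A minimal "Did you mean …?" query, assuming the t_location table from earlier, combines the % filter (so the trigram index can be used) with <-> ordering (so the best candidate comes first):
-- Suggest the closest known name for a misspelled input
SELECT name
FROM t_location
WHERE name % 'gramatneusid'       -- index-friendly similarity filter
ORDER BY name <-> 'gramatneusid'  -- rank the remaining candidates
LIMIT 1;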
11. Summary: Strengths of pg_trgm
- Provides fuzzy search (did-you-mean suggestions, typo tolerance).
- Works with LIKE, ILIKE, and regular expressions.
- Can be indexed with GiST / GIN for high performance.
- A very good fit for modern search bars.
⚠️ Drawbacks:
- Trigram indexes can get fairly large (they store many trigram combinations).
- They can return false-positive "similar" results, so the threshold needs tuning.
In short, with pg_trgm plus a GIN/GiST index, PostgreSQL goes from being an ordinary RDBMS to a Google-like text search engine, complete with fuzzy search, LIKE acceleration, and regex optimization.
Example Program
The following Go program (chi + pgx) is a small REST API that stores articles and exposes a fuzzy search endpoint backed by pg_trgm GIN indexes.
package main
import (
	"context"
	"encoding/json"
	"fmt"
	"log"
	"net/http"
	"os"
	"strconv"
	"time"

	"github.com/go-chi/chi/v5"
	"github.com/go-chi/chi/v5/middleware"
	"github.com/jackc/pgx/v5/pgxpool"
)
func main() {
	// Get database connection string from environment variable
	dbConnStr := os.Getenv("DATABASE_URL")
	if dbConnStr == "" {
		dbConnStr = "postgres://postgres:password@localhost:5432/article_management?sslmode=disable"
	}
	// Initialize database connection
	db, err := NewDB(dbConnStr)
	if err != nil {
		log.Fatalf("Failed to connect to database: %v", err)
	}
	defer db.Close()
	// Initialize handlers
	articleHandler := NewArticleHandler(db)
	// Setup router
	r := chi.NewRouter()
	r.Use(middleware.RequestID)
	r.Use(middleware.RealIP)
	r.Use(middleware.Logger)
	r.Use(middleware.Recoverer)
	r.Use(middleware.AllowContentType("application/json"))
	// Register routes
	articleHandler.RegisterRoutes(r)
	// Add a simple health check endpoint
	r.Get("/health", func(w http.ResponseWriter, r *http.Request) {
		w.WriteHeader(http.StatusOK)
		w.Write([]byte("OK"))
	})
	// Start server
	port := os.Getenv("PORT")
	if port == "" {
		port = "8080"
	}
	fmt.Printf("Server starting on port %s...\n", port)
	log.Fatal(http.ListenAndServe(":"+port, r))
}
type ArticleHandler struct {
	db *DB
}

func NewArticleHandler(db *DB) *ArticleHandler {
	return &ArticleHandler{db: db}
}

func (h *ArticleHandler) RegisterRoutes(r chi.Router) {
	r.Route("/articles", func(r chi.Router) {
		r.Post("/", h.CreateArticle)
		r.Get("/", h.SearchArticles)
		r.Get("/{id}", h.GetArticle)
	})
}
// CreateArticle handles creating a new article
func (h *ArticleHandler) CreateArticle(w http.ResponseWriter, r *http.Request) {
	var req CreateArticleRequest
	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
		http.Error(w, "Invalid request payload", http.StatusBadRequest)
		return
	}
	// Validate input
	if req.Title == "" || req.Content == "" || req.Author == "" {
		http.Error(w, "Title, content, and author are required", http.StatusBadRequest)
		return
	}
	// Use the request context so the query is cancelled if the client disconnects
	ctx := r.Context()
	pool := h.db.GetPool()
	var articleID int
	var createdAt time.Time
	err := pool.QueryRow(ctx, `
		INSERT INTO articles (title, content, author)
		VALUES ($1, $2, $3)
		RETURNING id, created_at
	`, req.Title, req.Content, req.Author).Scan(&articleID, &createdAt)
	if err != nil {
		http.Error(w, fmt.Sprintf("Failed to create article: %v", err), http.StatusInternalServerError)
		return
	}
	article := Article{
		ID:        articleID,
		Title:     req.Title,
		Content:   req.Content,
		Author:    req.Author,
		CreatedAt: createdAt,
	}
	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(http.StatusCreated)
	json.NewEncoder(w).Encode(article)
}
// GetArticle retrieves a single article by ID
func (h *ArticleHandler) GetArticle(w http.ResponseWriter, r *http.Request) {
	idStr := chi.URLParam(r, "id")
	id, err := strconv.Atoi(idStr)
	if err != nil {
		http.Error(w, "Invalid article ID", http.StatusBadRequest)
		return
	}
	ctx := r.Context()
	pool := h.db.GetPool()
	var article Article
	err = pool.QueryRow(ctx, `
		SELECT id, title, content, author, created_at
		FROM articles
		WHERE id = $1
	`, id).Scan(&article.ID, &article.Title, &article.Content, &article.Author, &article.CreatedAt)
	if err != nil {
		http.Error(w, "Article not found", http.StatusNotFound)
		return
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(article)
}
// SearchArticles performs fuzzy search on articles
func (h *ArticleHandler) SearchArticles(w http.ResponseWriter, r *http.Request) {
	query := r.URL.Query().Get("query")
	if query == "" {
		http.Error(w, "Query parameter is required", http.StatusBadRequest)
		return
	}
	// Get pagination parameters with defaults
	limitStr := r.URL.Query().Get("limit")
	limit := 10 // default limit
	if limitStr != "" {
		var err error
		limit, err = strconv.Atoi(limitStr)
		if err != nil || limit <= 0 || limit > 100 {
			limit = 10
		}
	}
	offsetStr := r.URL.Query().Get("offset")
	offset := 0 // default offset
	if offsetStr != "" {
		var err error
		offset, err = strconv.Atoi(offsetStr)
		if err != nil || offset < 0 {
			offset = 0
		}
	}
	ctx := r.Context()
	pool := h.db.GetPool()
	// Use pg_trgm for fuzzy search with similarity ranking: the % operator filters
	// via the trigram indexes, and a weighted similarity() score over
	// title/content/author ranks the results.
	rows, err := pool.Query(ctx, `
		SELECT id, title, content, author, created_at,
		       (similarity(title, $1) * 0.6 +
		        similarity(content, $1) * 0.3 +
		        similarity(author, $1) * 0.1) AS relevance
		FROM articles
		WHERE
			title % $1 OR
			content % $1 OR
			author % $1
		ORDER BY relevance DESC
		LIMIT $2 OFFSET $3
	`, query, limit, offset)
	if err != nil {
		http.Error(w, fmt.Sprintf("Failed to search articles: %v", err), http.StatusInternalServerError)
		return
	}
	defer rows.Close()
	// Initialize to an empty slice so an empty result encodes as [] instead of null
	articles := []Article{}
	for rows.Next() {
		var article Article
		var relevance float64
		err := rows.Scan(
			&article.ID,
			&article.Title,
			&article.Content,
			&article.Author,
			&article.CreatedAt,
			&relevance,
		)
		if err != nil {
			http.Error(w, fmt.Sprintf("Failed to scan article: %v", err), http.StatusInternalServerError)
			return
		}
		articles = append(articles, article)
	}
	if err := rows.Err(); err != nil {
		http.Error(w, fmt.Sprintf("Error iterating articles: %v", err), http.StatusInternalServerError)
		return
	}
	w.Header().Set("Content-Type", "application/json")
	json.NewEncoder(w).Encode(articles)
}
type DB struct {
	pool *pgxpool.Pool
}

func NewDB(connString string) (*DB, error) {
	poolConfig, err := pgxpool.ParseConfig(connString)
	if err != nil {
		return nil, fmt.Errorf("unable to parse database config: %v", err)
	}
	pool, err := pgxpool.NewWithConfig(context.Background(), poolConfig)
	if err != nil {
		return nil, fmt.Errorf("unable to create connection pool: %v", err)
	}
	// Test the connection
	if err := pool.Ping(context.Background()); err != nil {
		return nil, fmt.Errorf("unable to ping database: %v", err)
	}
	// Create tables if they don't exist
	if err := createTables(pool); err != nil {
		return nil, fmt.Errorf("unable to create tables: %v", err)
	}
	return &DB{pool: pool}, nil
}
func createTables(pool *pgxpool.Pool) error {
	ctx := context.Background()
	// Create articles table
	_, err := pool.Exec(ctx, `
		CREATE TABLE IF NOT EXISTS articles (
			id SERIAL PRIMARY KEY,
			title VARCHAR(255) NOT NULL,
			content TEXT NOT NULL,
			author VARCHAR(100) NOT NULL,
			created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
		)
	`)
	if err != nil {
		return fmt.Errorf("failed to create articles table: %v", err)
	}
	// Create pg_trgm extension
	_, err = pool.Exec(ctx, "CREATE EXTENSION IF NOT EXISTS pg_trgm")
	if err != nil {
		return fmt.Errorf("failed to create pg_trgm extension: %v", err)
	}
	// Create GIN indexes for fuzzy search
	_, err = pool.Exec(ctx, `
		CREATE INDEX IF NOT EXISTS idx_articles_title_gin
		ON articles USING gin (title gin_trgm_ops)
	`)
	if err != nil {
		return fmt.Errorf("failed to create title index: %v", err)
	}
	_, err = pool.Exec(ctx, `
		CREATE INDEX IF NOT EXISTS idx_articles_content_gin
		ON articles USING gin (content gin_trgm_ops)
	`)
	if err != nil {
		return fmt.Errorf("failed to create content index: %v", err)
	}
	_, err = pool.Exec(ctx, `
		CREATE INDEX IF NOT EXISTS idx_articles_author_gin
		ON articles USING gin (author gin_trgm_ops)
	`)
	if err != nil {
		return fmt.Errorf("failed to create author index: %v", err)
	}
	return nil
}
func (db *DB) Close() {
	db.pool.Close()
}

func (db *DB) GetPool() *pgxpool.Pool {
	return db.pool
}

type Article struct {
	ID        int       `json:"id"`
	Title     string    `json:"title"`
	Content   string    `json:"content"`
	Author    string    `json:"author"`
	CreatedAt time.Time `json:"created_at"`
}

type CreateArticleRequest struct {
	Title   string `json:"title"`
	Content string `json:"content"`
	Author  string `json:"author"`
}

type SearchArticleRequest struct {
	Query  string `json:"query"`
	Limit  int    `json:"limit"`
	Offset int    `json:"offset"`
}
POST http://localhost:8080/articles
Content-Type: application/json
{
  "title": "Introduction to Fuzzy Search",
  "content": "Fuzzy search is a technique that finds strings which are approximately equal to a given string...",
  "author": "John Doe"
}
###
GET http://localhost:8080/articles?query=sear%20introduc