🎯 Mục tiêu bài học
RAG cần "đọc" documents trước khi search. Bài này cover cách load nhiều format khác nhau và xử lý chúng cho indexing.
Sau bài này, bạn sẽ:
✅ Load PDF, Word, CSV, Web pages ✅ Handle multiple formats với LangChain ✅ Extract tables, images, structured data ✅ Build robust document pipeline
📝 LangChain Document Loaders
PDF Loader
1# pip install pypdf2from langchain_community.document_loaders import PyPDFLoader34# Single PDF5loader = PyPDFLoader("data/company_report.pdf")6pages = loader.load()78print(f"Total pages: {len(pages)}")9print(f"Page 1 content: {pages[0].page_content[:200]}")10print(f"Metadata: {pages[0].metadata}")11# {'source': 'data/company_report.pdf', 'page': 0}Advanced PDF (with tables)
1# pip install unstructured[pdf]2from langchain_community.document_loaders import UnstructuredPDFLoader34# Better for complex PDFs with tables, images5loader = UnstructuredPDFLoader(6 "data/financial_report.pdf",7 mode="elements", # Split into elements (paragraphs, tables, etc.)8 strategy="hi_res" # High resolution parsing9)10elements = loader.load()1112for elem in elements[:5]:13 print(f"Type: {elem.metadata.get('category', 'unknown')}")14 print(f"Content: {elem.page_content[:100]}")15 print()Word Documents
# pip install docx2txt
from langchain_community.document_loaders import Docx2txtLoader

loader = Docx2txtLoader("data/policy_document.docx")
docs = loader.load()
print(f"Content length: {len(docs[0].page_content)} chars")

CSV / Excel
1from langchain_community.document_loaders import CSVLoader23loader = CSVLoader(4 "data/products.csv",5 csv_args={"delimiter": ","},6 source_column="product_name" # Use as source in metadata7)8docs = loader.load()910# Each row becomes a document11for doc in docs[:3]:12 print(doc.page_content[:150])13 print(doc.metadata)14 print()Web Pages
1from langchain_community.document_loaders import WebBaseLoader23# Single page4loader = WebBaseLoader("https://docs.python.org/3/tutorial/")5docs = loader.load()67# Multiple pages8loader = WebBaseLoader([9 "https://example.com/page1",10 "https://example.com/page2"11])12docs = loader.load()13print(f"Loaded {len(docs)} web pages")Recursive Web Crawl
from langchain_community.document_loaders import RecursiveUrlLoader
from bs4 import BeautifulSoup

def bs4_extractor(html: str) -> str:
    """Strip markup and return the page's visible text, one line per element."""
    soup = BeautifulSoup(html, "html.parser")
    return soup.get_text(separator="\n", strip=True)

loader = RecursiveUrlLoader(
    url="https://docs.example.com/",
    max_depth=2,              # Follow links at most two levels deep
    extractor=bs4_extractor,  # Convert each fetched page's HTML to plain text
)
docs = loader.load()
print(f"Crawled {len(docs)} pages")

Checkpoint
Bạn đã biết cách load PDF, Word, CSV, Web pages với LangChain chưa?
📝 Directory Loading
Load Entire Folder
1from langchain_community.document_loaders import DirectoryLoader23# Load all PDFs from a directory4loader = DirectoryLoader(5 "data/knowledge_base/",6 glob="**/*.pdf",7 loader_cls=PyPDFLoader,8 show_progress=True9)10docs = loader.load()11print(f"Loaded {len(docs)} pages from all PDFs")1213# Load multiple formats14from langchain_community.document_loaders import TextLoader1516loader_txt = DirectoryLoader(17 "data/knowledge_base/",18 glob="**/*.txt",19 loader_cls=TextLoader20)Custom Document Loader
from langchain_core.documents import Document

class CustomAPILoader:
    """Load documents from an internal articles API.

    Mirrors the LangChain loader interface: ``load()`` returns a list of
    ``Document`` objects carrying source/title/author/published_at metadata.
    """

    def __init__(self, api_url, api_key):
        self.api_url = api_url
        self.api_key = api_key

    def load(self):
        """Fetch all articles from the API and convert each into a Document.

        Raises ``requests.HTTPError`` on a non-2xx response and
        ``requests.Timeout`` if the endpoint does not answer in time.
        """
        import requests
        headers = {"Authorization": f"Bearer {self.api_key}"}
        # timeout: never hang forever on a dead endpoint;
        # raise_for_status: fail fast instead of parsing an error page as JSON.
        response = requests.get(self.api_url, headers=headers, timeout=30)
        response.raise_for_status()
        articles = response.json()["articles"]

        documents = []
        for article in articles:
            doc = Document(
                page_content=article["content"],
                metadata={
                    "source": article["url"],
                    "title": article["title"],
                    "author": article["author"],
                    "published_at": article["date"],
                },
            )
            documents.append(doc)

        return documents

# Usage
loader = CustomAPILoader("https://api.company.com/articles", "key123")
docs = loader.load()

Checkpoint
Bạn đã biết cách load toàn bộ folder và tạo custom document loader chưa?
💻 Document Processing Pipeline
Complete Pipeline
import os
from pathlib import Path
from langchain_core.documents import Document

class DocumentProcessor:
    """Load and process multiple document formats from one directory tree."""

    # Map file extension -> LangChain loader class used to parse it.
    LOADERS = {
        ".pdf": PyPDFLoader,
        ".txt": TextLoader,
        ".docx": Docx2txtLoader,
        ".csv": CSVLoader,
    }

    def __init__(self, data_dir):
        self.data_dir = Path(data_dir)

    def load_all(self):
        """Load all supported documents from the directory, recursively.

        Returns a flat list of Documents; each gets file-level metadata
        (file_name, file_type, file_size). Files that fail to load are
        reported and skipped so one bad file cannot abort the whole run.
        """
        all_docs = []

        for file_path in self.data_dir.rglob("*"):
            ext = file_path.suffix.lower()
            if ext in self.LOADERS:
                try:
                    loader_cls = self.LOADERS[ext]
                    loader = loader_cls(str(file_path))
                    docs = loader.load()

                    # Add file-level metadata
                    for doc in docs:
                        doc.metadata["file_name"] = file_path.name
                        doc.metadata["file_type"] = ext
                        doc.metadata["file_size"] = file_path.stat().st_size

                    all_docs.extend(docs)
                    print(f"Loaded: {file_path.name} ({len(docs)} chunks)")
                except Exception as e:
                    print(f"Error loading {file_path.name}: {e}")

        print(f"\nTotal documents loaded: {len(all_docs)}")
        return all_docs

    def clean_documents(self, docs):
        """Clean and normalize document content.

        Collapses whitespace, drops very short documents (< 50 chars) and
        exact duplicates; returns the surviving documents.
        """
        cleaned = []
        seen = set()  # O(1) duplicate lookup instead of rescanning `cleaned` per doc
        for doc in docs:
            content = doc.page_content

            # Remove excessive whitespace
            content = " ".join(content.split())

            # Skip very short documents
            if len(content) < 50:
                continue

            # Skip duplicates
            if content in seen:
                continue
            seen.add(content)

            doc.page_content = content
            cleaned.append(doc)

        print(f"After cleaning: {len(cleaned)} documents (removed {len(docs) - len(cleaned)})")
        return cleaned

# Usage
processor = DocumentProcessor("data/knowledge_base/")
docs = processor.load_all()
docs = processor.clean_documents(docs)

Checkpoint
Bạn đã xây dựng được document processing pipeline hoàn chỉnh chưa?
📝 Handling Vietnamese Documents
Vietnamese PDF Issues
1# Vietnamese PDFs often have encoding issues2# Use unstructured for better handling3from langchain_community.document_loaders import UnstructuredPDFLoader45loader = UnstructuredPDFLoader(6 "data/quy_dinh_lao_dong.pdf",7 mode="single", # Entire PDF as one document8 strategy="fast", # Fast parsing9 languages=["vie"] # Vietnamese language hint10)11docs = loader.load()1213# Verify Vietnamese text14print(docs[0].page_content[:500])Text Normalization
import unicodedata
import re

def normalize_vietnamese(text):
    """Normalize Vietnamese text for better search."""
    # Unicode normalization (NFC form): compose base letters with diacritics
    text = unicodedata.normalize("NFC", text)

    # Fix common encoding issues
    text = text.replace("\xa0", " ")   # Non-breaking space
    text = re.sub(r"\s+", " ", text)   # Collapse runs of whitespace
    # Drop control characters (Unicode category "C"), keeping newline/tab.
    # NOTE(review): the \s+ collapse above already replaced \n and \t with
    # spaces, so the "\n\t" exemption here never fires — confirm intent.
    text = "".join(
        c for c in text
        if not unicodedata.category(c).startswith("C") or c in "\n\t"
    )

    return text.strip()

# Apply to all documents
for doc in docs:
    doc.page_content = normalize_vietnamese(doc.page_content)

Checkpoint
Bạn đã biết cách xử lý Vietnamese PDFs và normalize Unicode text chưa?
🎯 Tổng kết
📝 Quiz
- UnstructuredPDFLoader vs PyPDFLoader?
- PyPDFLoader tốt hơn luôn
- Unstructured xử lý tốt hơn complex PDFs (tables, images, layout)
- Không khác nhau
- Unstructured chỉ cho English
- DirectoryLoader dùng để?
- Load tất cả documents từ một folder (hỗ trợ glob patterns)
- Tạo folder mới
- Chỉ load PDF
- Upload lên cloud
- Vietnamese PDF thường gặp vấn đề gì?
- Unicode encoding, dấu tiếng Việt bị lỗi, whitespace
- Không đọc được
- Quá chậm
- Chỉ dùng được OCR
Key Takeaways
- LangChain loaders — Hỗ trợ 50+ formats
- UnstructuredPDFLoader — Best cho complex PDFs
- DirectoryLoader — Batch load entire folders
- Custom loaders — Extend cho internal APIs/sources
- Vietnamese — NFC normalization essential
Câu hỏi tự kiểm tra
- UnstructuredPDFLoader và PyPDFLoader khác nhau như thế nào và khi nào nên dùng loại nào?
- DirectoryLoader hỗ trợ những tính năng gì để load nhiều documents cùng lúc?
- Khi xử lý tài liệu tiếng Việt, cần chú ý những vấn đề gì về Unicode và encoding?
- Làm thế nào để tạo custom document loader cho nguồn dữ liệu riêng (API, database)?
🎉 Tuyệt vời! Bạn đã hoàn thành bài học Document Loaders & Formats!
Tiếp theo: Hãy cùng tìm hiểu về Chunking Strategies trong bài tiếp theo!
🚀 Bài tiếp theo
Chunking Strategies — Chia documents thành chunks tối ưu cho retrieval!
