Basic Summarization
Simple Text Summarization
Copy
from collections.abc import Iterator
from typing import Optional

from openai import OpenAI
def summarize_text(
    text: str,
    max_length: str = "medium",
    style: str = "informative"
) -> str:
    """Summarize text with configurable length and style.

    Args:
        text: The text to summarize.
        max_length: One of "brief", "medium" or "detailed". Any other value
            falls back to the generic instruction "in a paragraph".
        style: One of "informative", "executive", "casual" or "technical".
            Any other value adds no style instruction.

    Returns:
        The model's summary. Returns "" without calling the API when
        ``text`` is empty or whitespace-only, and "" when the model sends
        back no content.
    """
    # Nothing to summarize: skip the request rather than let the model
    # invent a summary of empty input.
    if not text or not text.strip():
        return ""

    length_instructions = {
        "brief": "in 1-2 sentences",
        "medium": "in a short paragraph (3-5 sentences)",
        "detailed": "in 2-3 paragraphs with key details",
    }
    style_instructions = {
        "informative": "Focus on key facts and information",
        "executive": "Focus on decisions, actions, and business implications",
        "casual": "Use a conversational, easy-to-read tone",
        "technical": "Preserve technical details and terminology",
    }
    length_clause = length_instructions.get(max_length, "in a paragraph")
    style_clause = style_instructions.get(style, "")

    prompt = (
        f"Summarize the following text {length_clause}.\n"
        f"{style_clause}\n"
        "Text:\n"
        f"{text}\n"
        "Summary:"
    )

    client = OpenAI()
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
    )
    # The SDK types message.content as optional; honour the declared str return.
    return response.choices[0].message.content or ""
# Example: boil a short article down to an executive-style brief.
article = """
Artificial intelligence has made remarkable strides in recent years,
particularly in the field of natural language processing. Large language
models can now write code, analyze documents, and engage in nuanced
conversations. These advances have led to widespread adoption in enterprise
settings, with companies using AI for customer service, content creation,
and data analysis. However, concerns about accuracy, bias, and job
displacement continue to spark debate among policymakers and technologists.
"""
summary = summarize_text(text=article, style="executive", max_length="brief")
print(summary)
Extractive vs Abstractive Summarization
Copy
from openai import OpenAI
import json
class Summarizer:
    """Multi-mode summarization system.

    Provides extractive (verbatim sentences), abstractive (rewritten prose)
    and hybrid (both) summaries via the OpenAI chat completions API.
    """

    def __init__(self, model: str = "gpt-4o-mini"):
        """Create the API client and remember which chat model to use."""
        self.client = OpenAI()
        self.model = model

    def extractive_summary(
        self,
        text: str,
        num_sentences: int = 3
    ) -> list[str]:
        """Extract the most important sentences from text.

        Args:
            text: Source text to pick sentences from.
            num_sentences: How many sentences to request from the model.

        Returns:
            The sentences the model returned under the "sentences" key.
            Returns [] without calling the API when ``text`` is blank or
            ``num_sentences`` is not positive, and [] when the reply has no
            usable "sentences" list.

        Raises:
            json.JSONDecodeError: If the model reply is not valid JSON.
        """
        if num_sentences <= 0 or not text.strip():
            return []

        prompt = (
            f"Identify the {num_sentences} most important sentences from this text.\n"
            "Return them exactly as they appear, preserving the original wording.\n"
            "Text:\n"
            f"{text}\n"
            'Return as JSON: {"sentences": ["sentence 1", "sentence 2", ...]}'
        )
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"},
        )
        # content is optional in the SDK; json.loads(None) would raise TypeError.
        data = json.loads(response.choices[0].message.content or "{}")
        sentences = data.get("sentences") if isinstance(data, dict) else None
        if not isinstance(sentences, list):
            return []
        return [str(sentence) for sentence in sentences]

    def abstractive_summary(
        self,
        text: str,
        target_length: int = 100
    ) -> str:
        """Generate a new summary capturing key ideas.

        Args:
            text: Source text to summarize.
            target_length: Approximate word count requested in the prompt.

        Returns:
            The model's summary, or "" when ``text`` is blank (no API call)
            or the model returns no content.
        """
        if not text.strip():
            return ""

        prompt = (
            f"Write a summary of approximately {target_length} words.\n"
            "Capture the main ideas in your own words. Be concise and clear.\n"
            "Text:\n"
            f"{text}\n"
            "Summary:"
        )
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
        )
        return response.choices[0].message.content or ""

    def hybrid_summary(
        self,
        text: str,
        extract_count: int = 3,
        abstract_length: int = 100
    ) -> dict:
        """Combine extractive and abstractive summarization.

        Returns:
            A dict with "key_sentences" (list of str), "summary" (str) and
            "method" (always "hybrid").
        """
        key_sentences = self.extractive_summary(text, extract_count)
        abstract = self.abstractive_summary(text, abstract_length)
        return {
            "key_sentences": key_sentences,
            "summary": abstract,
            "method": "hybrid",
        }
# Usage
summarizer = Summarizer()
text = """
The global transition to renewable energy is accelerating faster than many
analysts predicted. Solar and wind power now account for a significant
portion of new electricity generation capacity worldwide. Investment in
clean energy reached record levels last year, surpassing fossil fuel
investments for the first time. Major automakers have announced ambitious
plans to phase out internal combustion engines within the next decade.
Despite this progress, challenges remain, including grid infrastructure
upgrades, energy storage solutions, and ensuring a just transition for
workers in traditional energy sectors.
"""
# Extractive: sentences quoted from the text
sentences = summarizer.extractive_summary(text, num_sentences=2)
print("Key sentences:")
for s in sentences:
    print(f" - {s}")
# Abstractive: a freshly written summary
summary = summarizer.abstractive_summary(text, target_length=50)
print(f"\nAbstract summary:\n{summary}")
Long Document Summarization
Map-Reduce Pattern
Copy
from openai import OpenAI
from dataclasses import dataclass
@dataclass
class DocumentChunk:
    """A chunk of a document."""

    index: int  # zero-based position of the chunk in the document
    text: str
    summary: Optional[str] = None  # filled in by the map step


class LongDocumentSummarizer:
    """Summarize long documents using map-reduce.

    The document is split into word-aligned chunks, each chunk is summarized
    on its own (map), and the chunk summaries are merged into one final
    summary (reduce).
    """

    def __init__(
        self,
        chunk_size: int = 3000,
        model: str = "gpt-4o-mini"
    ):
        """Create the API client.

        Args:
            chunk_size: Character threshold at which a chunk is closed.
            model: Chat model used for both the map and reduce calls.
        """
        self.client = OpenAI()
        self.chunk_size = chunk_size
        self.model = model

    def _chunk_text(self, text: str) -> list[DocumentChunk]:
        """Split text into chunks.

        Words are accumulated until their combined length (counting one
        separator per word) reaches ``chunk_size``; the chunk is then closed.
        A chunk can therefore slightly exceed ``chunk_size``, but words are
        never split. Whitespace is normalised to single spaces.

        Returns:
            Chunks in document order; [] for empty or whitespace-only text.
        """
        chunks: list[DocumentChunk] = []
        current_words: list[str] = []
        current_length = 0

        for word in text.split():
            current_words.append(word)
            current_length += len(word) + 1  # +1 for the joining space
            if current_length >= self.chunk_size:
                chunks.append(DocumentChunk(
                    index=len(chunks),
                    text=" ".join(current_words),
                ))
                current_words = []
                current_length = 0

        # Flush the trailing partial chunk.
        if current_words:
            chunks.append(DocumentChunk(
                index=len(chunks),
                text=" ".join(current_words),
            ))
        return chunks

    def _summarize_chunk(self, chunk: DocumentChunk) -> str:
        """Summarize a single chunk (the map step)."""
        prompt = (
            "Summarize this section of a document.\n"
            "Capture all key points and important details.\n"
            f"Section {chunk.index + 1}:\n"
            f"{chunk.text}\n"
            "Summary:"
        )
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
        )
        # content is optional in the SDK; keep the declared str return.
        return response.choices[0].message.content or ""

    def _combine_summaries(
        self,
        summaries: list[str],
        final_length: str = "medium"
    ) -> str:
        """Combine chunk summaries into final summary (the reduce step).

        Args:
            summaries: Chunk summaries in document order.
            final_length: "brief", "medium" or "detailed"; anything else
                falls back to "1-2 paragraphs".
        """
        combined = "\n\n".join(
            f"Section {i + 1}: {s}" for i, s in enumerate(summaries)
        )
        length_guide = {
            "brief": "2-3 sentences",
            "medium": "1-2 paragraphs",
            "detailed": "3-4 paragraphs with key details",
        }
        target = length_guide.get(final_length, "1-2 paragraphs")
        prompt = (
            "Create a cohesive summary from these section summaries.\n"
            f"Target length: {target}\n"
            "Section summaries:\n"
            f"{combined}\n"
            "Final summary:"
        )
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
        )
        return response.choices[0].message.content or ""

    def summarize(
        self,
        document: str,
        final_length: str = "medium"
    ) -> dict:
        """Summarize a long document.

        Returns:
            A dict with "final_summary" (str), "chunk_count" (int) and
            "chunk_summaries" (list of str, in document order). For an empty
            or whitespace-only document the summary is "" and no API call
            is made.
        """
        # Map: chunk and summarize each
        chunks = self._chunk_text(document)
        if not chunks:
            # Nothing to reduce; asking the model to merge zero summaries
            # would only produce invented text.
            return {"final_summary": "", "chunk_count": 0, "chunk_summaries": []}

        for chunk in chunks:
            chunk.summary = self._summarize_chunk(chunk)

        # Reduce: combine summaries
        chunk_summaries = [c.summary for c in chunks]
        final_summary = self._combine_summaries(chunk_summaries, final_length)
        return {
            "final_summary": final_summary,
            "chunk_count": len(chunks),
            "chunk_summaries": chunk_summaries,
        }
# Example: run the map-reduce summarizer end to end.
# The placeholder below stands in for a much longer document.
document = """[Long document text here...]"""
summarizer = LongDocumentSummarizer(chunk_size=2000)
result = summarizer.summarize(document=document, final_length="medium")
print(f"Processed {result['chunk_count']} chunks")
print(f"\nFinal Summary:\n{result['final_summary']}")
Hierarchical Summarization
Copy
from openai import OpenAI
from dataclasses import dataclass, field
@dataclass
class Section:
    """A document section with hierarchy."""

    title: str
    content: str
    level: int  # nesting depth; level 1 is treated as top-level
    children: list = field(default_factory=list)  # directly nested Sections
    summary: Optional[str] = None  # filled in by summarize_document


class HierarchicalSummarizer:
    """Summarize documents with hierarchical structure."""

    def __init__(self, model: str = "gpt-4o-mini"):
        """Create the API client and remember which chat model to use."""
        self.client = OpenAI()
        self.model = model

    @staticmethod
    def _build_hierarchy(sections: list[Section]) -> None:
        """Link each section to its parent based on ``level``.

        ``sections`` must be in document order. A section becomes a child of
        the nearest preceding section with a strictly smaller level. The
        list itself is not modified; only ``children`` is appended to.
        """
        ancestors: list[Section] = []
        for section in sections:
            # Drop siblings and deeper sections; what remains is the parent chain.
            while ancestors and ancestors[-1].level >= section.level:
                ancestors.pop()
            if ancestors:
                ancestors[-1].children.append(section)
            ancestors.append(section)

    def parse_sections(self, document: str, max_chars: int = 4000) -> list[Section]:
        """Parse document into sections using LLM.

        Args:
            document: Full document text.
            max_chars: Only the first ``max_chars`` characters are sent to
                the model; sections beyond that point are not detected.

        Returns:
            Sections in document order, with ``children`` linked according
            to their levels. Returns [] (no API call) for a blank document.

        Raises:
            json.JSONDecodeError: If the model reply is not valid JSON.
        """
        import json

        if not document.strip():
            return []

        excerpt = document[:max_chars]
        if len(document) > max_chars:
            # Mark the cut only when text was actually dropped.
            excerpt += "..."

        prompt = (
            "Analyze this document and identify its sections.\n"
            "For each section, identify the title, content, and hierarchy level.\n"
            "Document:\n"
            f"{excerpt}\n"
            "Return as JSON:\n"
            "{\n"
            '"sections": [\n'
            '{"title": "Section Title", "content": "section content...", "level": 1}\n'
            "]\n"
            "}"
        )
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"},
        )
        data = json.loads(response.choices[0].message.content or "{}")

        sections: list[Section] = []
        for raw in data.get("sections") or []:
            if not isinstance(raw, dict):
                continue
            # Model output is untrusted: tolerate missing keys and bad levels.
            try:
                level = int(raw.get("level", 1))
            except (TypeError, ValueError):
                level = 1
            sections.append(Section(
                title=str(raw.get("title", "")),
                content=str(raw.get("content", "")),
                level=level,
            ))

        self._build_hierarchy(sections)
        return sections

    def summarize_section(
        self,
        section: Section,
        child_summaries: Optional[list[str]] = None
    ) -> str:
        """Summarize a section, incorporating child summaries if present."""
        context = ""
        if child_summaries:
            context = "\n\nSubsection summaries:\n" + "\n".join(child_summaries)
        prompt = (
            "Summarize this section of a document.\n"
            f"{context}\n"
            f"Section: {section.title}\n"
            f"Content: {section.content}\n"
            "Summary:"
        )
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
        )
        # content is optional in the SDK; keep the declared str return.
        return response.choices[0].message.content or ""

    def summarize_document(self, document: str) -> dict:
        """Create hierarchical summary of document.

        Returns:
            A dict with "executive_summary" (str) and "sections", a flat
            list of {"title", "summary"} dicts in document order. When no
            sections are found, the summary is "" and the list is empty.
        """
        sections = self.parse_sections(document)
        if not sections:
            return {"executive_summary": "", "sections": []}

        # Summarize from leaves up: children follow their parent in document
        # order, so walking backwards summarizes every child before its parent.
        for section in reversed(sections):
            child_summaries = [c.summary for c in section.children if c.summary]
            section.summary = self.summarize_section(section, child_summaries)

        # Create executive summary from top-level sections. If the model
        # labelled nothing as level 1, fall back to the shallowest level found.
        top_level = [s for s in sections if s.level == 1]
        if not top_level:
            shallowest = min(s.level for s in sections)
            top_level = [s for s in sections if s.level == shallowest]

        bullet_list = "\n".join(f"- {s.title}: {s.summary}" for s in top_level)
        exec_prompt = (
            "Create an executive summary from these section summaries:\n"
            f"{bullet_list}\n"
            "Executive Summary:"
        )
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": exec_prompt}],
        )
        return {
            "executive_summary": response.choices[0].message.content or "",
            "sections": [
                {"title": s.title, "summary": s.summary}
                for s in sections
            ],
        }
Meeting Summarization
Copy
from openai import OpenAI
from dataclasses import dataclass
import json
@dataclass
class MeetingSummary:
    """Structured meeting summary."""

    title: str
    date: str
    attendees: list[str]
    summary: str
    key_points: list[str]
    decisions: list[str]
    action_items: list[dict]  # dicts with "task", "assignee", "due_date" keys
    next_steps: list[str]


class MeetingSummarizer:
    """Summarize meeting transcripts."""

    def __init__(self, model: str = "gpt-4o"):
        """Create the API client and remember which chat model to use."""
        self.client = OpenAI()
        self.model = model

    def summarize(
        self,
        transcript: str,
        meeting_info: Optional[dict] = None
    ) -> MeetingSummary:
        """Create a comprehensive meeting summary.

        Args:
            transcript: Raw meeting transcript.
            meeting_info: Optional dict with "title", "date" and "attendees"
                (list of names). Missing entries get placeholder values.

        Returns:
            A MeetingSummary. Title, date and attendees come from
            ``meeting_info``; the remaining fields come from the model. For
            a blank transcript those fields are empty and no API call is made.

        Raises:
            json.JSONDecodeError: If the model reply is not valid JSON.
        """
        meeting_info = meeting_info or {}
        result_title = meeting_info.get("title", "Untitled Meeting")
        result_date = meeting_info.get("date", "Unknown")
        result_attendees = meeting_info.get("attendees", [])

        if not transcript.strip():
            return MeetingSummary(
                title=result_title,
                date=result_date,
                attendees=result_attendees,
                summary="",
                key_points=[],
                decisions=[],
                action_items=[],
                next_steps=[],
            )

        prompt_title = meeting_info.get("title", "Unknown")
        prompt_date = meeting_info.get("date", "Unknown")
        prompt_attendees = ", ".join(meeting_info.get("attendees", ["Unknown"]))
        # NOTE: "topics_discussed" is requested from the model but
        # MeetingSummary has no field for it, so it is not returned.
        prompt = (
            "Analyze this meeting transcript and create a comprehensive summary.\n"
            "Meeting Info:\n"
            f"- Title: {prompt_title}\n"
            f"- Date: {prompt_date}\n"
            f"- Attendees: {prompt_attendees}\n"
            "Transcript:\n"
            f"{transcript}\n"
            "Provide a detailed analysis as JSON:\n"
            "{\n"
            '"summary": "2-3 paragraph summary of the meeting",\n'
            '"key_points": ["main points discussed"],\n'
            '"decisions": ["decisions that were made"],\n'
            '"action_items": [\n'
            '{"task": "description", "assignee": "person", "due_date": "if mentioned"}\n'
            "],\n"
            '"next_steps": ["follow-up items and next steps"],\n'
            '"topics_discussed": ["list of topics covered"]\n'
            "}"
        )
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"},
        )
        # content is optional in the SDK; json.loads(None) would raise TypeError.
        data = json.loads(response.choices[0].message.content or "{}")

        # `or` also covers explicit JSON nulls, which .get() defaults miss.
        return MeetingSummary(
            title=result_title,
            date=result_date,
            attendees=result_attendees,
            summary=data.get("summary") or "",
            key_points=data.get("key_points") or [],
            decisions=data.get("decisions") or [],
            action_items=data.get("action_items") or [],
            next_steps=data.get("next_steps") or [],
        )

    def format_as_markdown(self, summary: MeetingSummary) -> str:
        """Format meeting summary as markdown.

        Action items are rendered as unchecked task-list entries. Items
        without an assignee show "Unassigned"; items without a task show a
        placeholder instead of raising.
        """
        lines = [
            f"# {summary.title}",
            f"**Date:** {summary.date}",
            f"**Attendees:** {', '.join(summary.attendees)}",
            "",
            "## Summary",
            summary.summary,
            "",
            "## Key Points",
        ]
        lines.extend(f"- {point}" for point in summary.key_points)

        lines.extend(["", "## Decisions"])
        lines.extend(f"- {decision}" for decision in summary.decisions)

        lines.extend(["", "## Action Items"])
        for item in summary.action_items:
            if not isinstance(item, dict):
                # Model returned a bare value instead of an object.
                lines.append(f"- [ ] {item}")
                continue
            # `or` handles both a missing key and an explicit null/empty value.
            assignee = item.get("assignee") or "Unassigned"
            task = item.get("task") or "(no task description)"
            due = f" (Due: {item['due_date']})" if item.get("due_date") else ""
            lines.append(f"- [ ] **{assignee}**: {task}{due}")

        lines.extend(["", "## Next Steps"])
        lines.extend(f"- {step}" for step in summary.next_steps)
        return "\n".join(lines)
# Example: summarize a short planning-meeting transcript and print it as markdown.
summarizer = MeetingSummarizer()
transcript = """
Alice: Good morning everyone. Let's start with the Q3 planning meeting.
Bob: Thanks Alice. I wanted to discuss the new feature rollout timeline.
Alice: Sure. We need to finalize the launch date for the mobile app update.
Bob: I propose we aim for October 15th. That gives us three weeks for testing.
Carol: I agree with that timeline. My team can handle the QA by then.
Alice: Great, so we're agreed on October 15th. Bob, can you update the roadmap?
Bob: Will do. I'll have that ready by end of day tomorrow.
Carol: I'll also need design specs from David's team by next Monday.
Alice: David, can you make that happen?
David: Yes, I'll prioritize it. We'll have the designs ready by Monday morning.
Alice: Perfect. Any other items before we wrap up?
Bob: Just a reminder about the stakeholder meeting next Thursday.
Alice: Thanks Bob. Let's adjourn and reconvene next week for a progress check.
"""
# Title, date and attendees are supplied by the caller, not parsed from the transcript.
summary = summarizer.summarize(
    transcript=transcript,
    meeting_info={
        "title": "Q3 Planning Meeting",
        "date": "September 25, 2024",
        "attendees": ["Alice", "Bob", "Carol", "David"],
    },
)
markdown = summarizer.format_as_markdown(summary=summary)
print(markdown)
Streaming Summarization
Copy
from openai import OpenAI
def stream_summary(
    text: str,
    max_length: int = 200
) -> str:
    """Stream a summary as it's generated.

    Each piece of the reply is printed to stdout as soon as it arrives,
    followed by a final newline.

    Args:
        text: The text to summarize.
        max_length: Approximate word count requested in the prompt.

    Returns:
        The complete summary. Returns "" without calling the API (and
        without printing) when ``text`` is blank.
    """
    if not text.strip():
        return ""

    client = OpenAI()
    prompt = (
        f"Summarize this text in approximately {max_length} words:\n"
        f"{text}\n"
        "Summary:"
    )
    stream = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        stream=True,
    )

    pieces = []
    for chunk in stream:
        # A chunk may carry an empty `choices` list (e.g. a usage-only
        # chunk); indexing [0] on it would raise IndexError.
        if not chunk.choices:
            continue
        content = chunk.choices[0].delta.content
        if content:
            print(content, end="", flush=True)
            pieces.append(content)
    print()  # Newline at end
    return "".join(pieces)
class ProgressiveSummarizer:
    """Generate summaries at multiple detail levels progressively."""

    def __init__(self, model: str = "gpt-4o-mini"):
        """Create the API client and remember which chat model to use."""
        self.client = OpenAI()
        self.model = model

    def summarize_progressive(
        self,
        text: str,
        levels: Optional[list[str]] = None
    ) -> Iterator[str]:
        """Generate summaries at increasing detail levels.

        This is a generator: each summary is streamed to stdout, and the
        level name is yielded once that level has finished printing.

        Args:
            text: The text to summarize.
            levels: Level names to produce, in order. Known names are
                "one-sentence", "paragraph" and "detailed"; any other value
                is sent to the model verbatim as the instruction. Defaults
                to all three known levels when omitted or empty.

        Yields:
            The name of each completed level. Yields nothing (and makes no
            API call) when ``text`` is blank.
        """
        if not text.strip():
            return

        levels = levels or ["one-sentence", "paragraph", "detailed"]
        level_prompts = {
            "one-sentence": "Summarize in exactly one sentence.",
            "paragraph": "Summarize in one paragraph (3-5 sentences).",
            "detailed": "Provide a detailed summary with key points (2-3 paragraphs).",
        }

        for level in levels:
            instruction = level_prompts.get(level, level)
            print(f"\n--- {level.upper()} ---")
            stream = self.client.chat.completions.create(
                model=self.model,
                messages=[{
                    "role": "user",
                    "content": f"{instruction}\n\nText:\n{text}",
                }],
                stream=True,
            )
            for chunk in stream:
                # A chunk may carry an empty `choices` list; skip it rather
                # than raise IndexError.
                if not chunk.choices:
                    continue
                content = chunk.choices[0].delta.content
                if content:
                    print(content, end="", flush=True)
            print("\n")
            yield level
# Usage
summarizer = ProgressiveSummarizer()
text = """[Your long text here]"""
# Generate progressive summaries. The generator must be iterated for
# anything to happen; each step streams one detail level to stdout.
for level in summarizer.summarize_progressive(text):
    pass  # Summaries are printed as they stream
Content-Specific Summarization
Copy
from openai import OpenAI
import json
class ContentSummarizer:
    """Summarize different types of content with specialized prompts.

    Every public method sends one prompt asking for a JSON object and
    returns the parsed object as a dict. The keys are whatever the model
    produced; the schema in each prompt is a request, not a guarantee.
    """

    def __init__(self, model: str = "gpt-4o-mini"):
        """Create the API client and remember which chat model to use."""
        self.client = OpenAI()
        self.model = model

    def _complete_json(self, prompt: str) -> dict:
        """Send ``prompt`` in JSON mode and return the parsed reply.

        Raises:
            json.JSONDecodeError: If the model reply is not valid JSON.
        """
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"},
        )
        # content is optional in the SDK; json.loads(None) would raise TypeError.
        return json.loads(response.choices[0].message.content or "{}")

    @staticmethod
    def _format_email_thread(emails: list[dict]) -> str:
        """Render emails as header+body blocks separated by blank lines.

        Each dict must have "from", "to", "date" and "body" keys.
        """
        return "\n\n".join(
            f"From: {e['from']}\nTo: {e['to']}\nDate: {e['date']}\n\n{e['body']}"
            for e in emails
        )

    @staticmethod
    def _code_prompt(code: str, language: str) -> str:
        """Build the code-summary prompt, fencing ``code`` as ``language``."""
        return (
            f"Summarize this {language} code:\n"
            # The fence tag follows `language` instead of always being "python".
            f"```{language}\n"
            f"{code}\n"
            "```\n"
            "Provide as JSON:\n"
            "{\n"
            '"purpose": "what the code does",\n'
            '"summary": "brief functional summary",\n'
            '"main_components": ["key functions/classes"],\n'
            '"inputs": ["expected inputs"],\n'
            '"outputs": ["expected outputs"],\n'
            '"dependencies": ["external dependencies used"],\n'
            '"complexity": "simple/moderate/complex",\n'
            '"potential_issues": ["any noted concerns"]\n'
            "}"
        )

    def summarize_news(self, article: str) -> dict:
        """Summarize a news article.

        Requests headline, summary, key_facts and who/what/when/where/why.
        """
        prompt = (
            "Summarize this news article:\n"
            f"{article}\n"
            "Provide as JSON:\n"
            "{\n"
            '"headline": "suggested headline",\n'
            '"summary": "2-3 sentence summary",\n'
            '"key_facts": ["list of key facts"],\n'
            '"who": "key people/organizations involved",\n'
            '"what": "what happened",\n'
            '"when": "when it happened",\n'
            '"where": "where it happened",\n'
            '"why": "why it matters"\n'
            "}"
        )
        return self._complete_json(prompt)

    def summarize_research(self, paper: str) -> dict:
        """Summarize a research paper or technical document.

        Requests title, abstract, problem, methodology, findings,
        contributions, limitations, future_work and practical_implications.
        """
        prompt = (
            "Summarize this research document:\n"
            f"{paper}\n"
            "Provide as JSON:\n"
            "{\n"
            '"title": "paper title if identifiable",\n'
            '"abstract": "brief abstract/overview",\n'
            '"problem": "problem being addressed",\n'
            '"methodology": "approach/methods used",\n'
            '"findings": ["key findings"],\n'
            '"contributions": ["main contributions"],\n'
            '"limitations": ["noted limitations"],\n'
            '"future_work": ["suggested future directions"],\n'
            '"practical_implications": "real-world applications"\n'
            "}"
        )
        return self._complete_json(prompt)

    def summarize_email_thread(self, emails: list[dict]) -> dict:
        """Summarize an email thread.

        Args:
            emails: Messages in thread order, each a dict with "from",
                "to", "date" and "body" keys.

        Raises:
            KeyError: If an email dict lacks one of the required keys.
        """
        thread_text = self._format_email_thread(emails)
        prompt = (
            "Summarize this email thread:\n"
            f"{thread_text}\n"
            "Provide as JSON:\n"
            "{\n"
            '"subject": "thread subject",\n'
            '"summary": "brief summary of the discussion",\n'
            '"participants": ["list of participants"],\n'
            '"key_points": ["main points discussed"],\n'
            '"requests": ["any requests made"],\n'
            '"decisions": ["any decisions reached"],\n'
            '"pending_items": ["items still pending"],\n'
            '"sentiment": "overall tone of the thread"\n'
            "}"
        )
        return self._complete_json(prompt)

    def summarize_code(self, code: str, language: str = "python") -> dict:
        """Summarize code functionality.

        Args:
            code: Source code to describe.
            language: Language name, used both in the instruction and as
                the code-fence tag.
        """
        return self._complete_json(self._code_prompt(code, language))
# Example: content-specific summaries. The "..." arguments are placeholders
# for real article and paper text.
summarizer = ContentSummarizer()
# News article -> headline and short summary
news_summary = summarizer.summarize_news(article="...")
print(f"Headline: {news_summary['headline']}")
print(f"Summary: {news_summary['summary']}")
# Research paper -> problem statement and findings
research_summary = summarizer.summarize_research(paper="...")
print(f"Problem: {research_summary['problem']}")
print(f"Findings: {research_summary['findings']}")
Summarization Best Practices
- Choose extractive for preserving exact quotes and facts
- Use abstractive for readable, flowing summaries
- Apply map-reduce for documents exceeding context limits
- Tailor prompts to content type for better results
- Validate summaries against source for accuracy
Practice Exercise
Build a summarization service that:
- Handles multiple content types (articles, meetings, code)
- Supports variable length outputs
- Preserves key information and accuracy
- Provides structured output with metadata
- Scales to long documents using chunking
- Accuracy preservation in summaries
- Appropriate level of detail
- Coherent multi-chunk summarization
- Specialized handling for different content types