Extract article content from news URLs using Diffbot Article API
This task extracts full article content from news URLs using the Diffbot Article API. It retrieves a news item by UUID, uses Diffbot's AI-powered content extraction service to parse the article text from the URL, and stores the extracted content along with performance metrics in the database. The task handles both new extractions and updates to existing content, providing robust error handling and detailed logging throughout the process.
The main function orchestrates a three-step content extraction workflow:
News Item Retrieval: Calls the getNewsItem boundary function to fetch the news item by UUID from the MongoDB database, and fails the task if no matching item exists
Content Extraction: Uses the extractContentWithDiffbot boundary function to send the news URL to the Diffbot Article API service, which uses AI to identify and extract clean article text from the web page
Content Storage: Calls the saveToNewsContent boundary function to create a new NewsContent record, or update the existing one, in the MongoDB database with the extracted text and performance metrics
The task includes comprehensive error handling that captures failed extractions with metrics, detailed console logging for monitoring, and content preview functionality for verification. If extraction fails, the task still attempts to save failure metrics before re-throwing the error for proper task failure handling.
// TASK: extractContent
// Run this task with:
// forge task:run diffbot:extractContent --uuid="[NEWS_UUID]"
import { createTask } from '@forgehive/task'
import { Schema } from '@forgehive/schema'
import { NewsModel, NewsContentModel } from '../../models/index.js'
// Task identifier; this is the name passed to `forge task:run`.
const name = 'diffbot:extractContent'
// Short human-readable summary handed to createTask alongside the name.
const description = 'Extract article content from news URLs using Diffbot Article API'
// Input schema: the task accepts a single string argument, `uuid`,
// which the task function uses to look up the news item.
const schema = new Schema({
uuid: Schema.string()
})
/** Metrics describing a single Diffbot extraction attempt, successful or not. */
interface DiffbotExtractionMetrics {
  contentLength: number
  extractionDuration: number
  extractedAt: Date
  domain: string
  success: boolean
  errorMessage?: string
}

/** The subset of a Diffbot Article API response body that this module reads. */
interface DiffbotArticleResponse {
  objects?: Array<{ text?: string; html?: string }>
  text?: string
  html?: string
}

/** Placeholder stored in the metrics when the article URL cannot be parsed. */
const UNKNOWN_DOMAIN = 'unknown'

/**
 * Returns the hostname of `url`, or `UNKNOWN_DOMAIN` when `url` is not an
 * absolute URL. Never throws, so it is safe to use while building failure
 * metrics.
 */
const hostnameOrUnknown = (url: string): string => {
  try {
    return new URL(url).hostname
  } catch {
    return UNKNOWN_DOMAIN
  }
}

/**
 * Error thrown by `extractContentWithDiffbot` when an extraction attempt fails.
 *
 * It carries the metrics of the failed attempt so the caller can persist them.
 * The `error` property duplicates `message`; it is kept so that callers which
 * read `.error` / `.extractionMetrics` from the thrown value keep working.
 */
class DiffbotExtractionError extends Error {
  readonly error: string
  readonly extractionMetrics: DiffbotExtractionMetrics

  constructor(message: string, extractionMetrics: DiffbotExtractionMetrics) {
    super(message)
    this.name = 'DiffbotExtractionError'
    this.error = message
    this.extractionMetrics = extractionMetrics
  }
}

/**
 * Side-effecting operations used by the task: database reads/writes and the
 * Diffbot HTTP call.
 */
const boundaries = {
  /**
   * Looks up a single news item by its UUID.
   *
   * @param uuid - UUID of the news item to load.
   * @returns The lean (plain object) result of the `findOne` query.
   */
  getNewsItem: async (uuid: string) => {
    return await NewsModel.findOne({ uuid }).lean()
  },

  /**
   * Creates or updates the `diffbot` NewsContent record for a news item.
   *
   * @param newsUuid - UUID of the news item the content belongs to.
   * @param url - URL of the article. Currently not written to the record.
   *   NOTE(review): confirm whether NewsContent is expected to store it.
   * @param extractedContent - Extracted text; callers pass '' for failed attempts.
   * @param extractionMetrics - Metrics of the attempt. Left as `any` because it
   *   is assigned straight onto the model, whose field type is declared elsewhere.
   * @returns The saved NewsContent document.
   */
  saveToNewsContent: async (newsUuid: string, url: string, extractedContent: string, extractionMetrics: any) => {
    // Check if content already exists for this provider and news item
    const existingContent = await NewsContentModel.findOne({
      newsUuid,
      provider: 'diffbot'
    })

    if (existingContent) {
      console.log(`📝 [DIFFBOT] Updating existing NewsContent: ${existingContent.uuid}`)
      // Overwrite the previous attempt in place.
      // NOTE(review): a failed re-extraction replaces earlier content with ''.
      existingContent.extractedContent = extractedContent
      existingContent.extractionMetrics = extractionMetrics
      existingContent.updatedAt = new Date()
      return await existingContent.save()
    }

    console.log(`📝 [DIFFBOT] Creating new NewsContent`)
    const newsContent = new NewsContentModel({
      newsUuid,
      provider: 'diffbot',
      extractedContent,
      extractionMetrics
    })
    return await newsContent.save()
  },

  /**
   * Fetches `url` through the Diffbot Article API and returns the article text.
   *
   * @param url - Article URL to extract.
   * @returns The extracted content together with the metrics of the attempt.
   * @throws Error when `DIFFBOT_API_TOKEN` is not set (no metrics attached).
   * @throws DiffbotExtractionError for any failure after the request starts
   *   (network error, non-2xx status, empty content); exposes `.error` and
   *   `.extractionMetrics`.
   */
  extractContentWithDiffbot: async (
    url: string
  ): Promise<{ content: string; extractionMetrics: DiffbotExtractionMetrics }> => {
    const token = process.env.DIFFBOT_API_TOKEN
    if (!token) {
      throw new Error('DIFFBOT_API_TOKEN environment variable is required')
    }

    const startTime = Date.now()
    // Resolved up front with a non-throwing helper: an unparseable URL must not
    // discard a successful extraction, nor mask the real error in the catch below.
    const domain = hostnameOrUnknown(url)

    try {
      console.log(`🔍 [DIFFBOT] Extracting content from: ${url}`)

      // Build the whole query with URLSearchParams so the token is encoded too.
      const searchParams = new URLSearchParams({ token, url })
      const response = await fetch(
        `https://api.diffbot.com/v3/article?${searchParams.toString()}`,
        {
          method: 'GET',
          headers: {}
        }
      )

      if (!response.ok) {
        throw new Error(`Diffbot API error: ${response.status} ${response.statusText}`)
      }

      const result: DiffbotArticleResponse = await response.json()
      const extractionDuration = Date.now() - startTime
      console.log(`📊 [DIFFBOT] API Response:`, JSON.stringify(result, null, 2))

      // Handle Article API response format
      let extractedContent = ''
      if (result.objects && result.objects.length > 0) {
        // Standard objects response: prefer plain text, fall back to HTML
        const contentObject = result.objects[0]
        extractedContent = contentObject.text || contentObject.html || ''
      } else {
        // Direct text / HTML response
        extractedContent = result.text || result.html || ''
      }

      if (!extractedContent || extractedContent.trim().length === 0) {
        throw new Error(`No text content found in Diffbot response. Response structure: ${JSON.stringify(Object.keys(result))}`)
      }

      return {
        content: extractedContent,
        extractionMetrics: {
          contentLength: extractedContent.length,
          extractionDuration,
          extractedAt: new Date(),
          domain,
          success: true
        }
      }
    } catch (error) {
      console.error(`❌ [DIFFBOT] Extraction failed:`, error)

      const message = error instanceof Error ? error.message : 'Unknown error'
      // Throw a real Error (stack trace, instanceof) that still carries the
      // metrics of the failed attempt for the caller to persist.
      throw new DiffbotExtractionError(message, {
        contentLength: 0,
        extractionDuration: Date.now() - startTime,
        extractedAt: new Date(),
        domain,
        success: false,
        errorMessage: message
      })
    }
  }
}
/** True when `value` is a non-null object whose properties can be inspected. */
const isFailureRecord = (value: unknown): value is Record<string, unknown> =>
  typeof value === 'object' && value !== null

/**
 * Normalizes whatever was thrown into an `Error` so the task never rejects
 * with a bare string or plain object.
 *
 * - `Error` instances are returned unchanged (stack trace preserved).
 * - Objects carrying a non-empty string `error` property become
 *   `new Error(error)`.
 * - Anything else becomes an Error built from the thrown string, or
 *   'Unknown error'.
 */
const toTaskError = (failure: unknown): Error => {
  if (failure instanceof Error) {
    return failure
  }
  if (isFailureRecord(failure) && typeof failure.error === 'string' && failure.error.length > 0) {
    return new Error(failure.error)
  }
  return new Error(typeof failure === 'string' && failure.length > 0 ? failure : 'Unknown error')
}

/**
 * Task `diffbot:extractContent`.
 *
 * Loads the news item identified by `uuid`, extracts its article text through
 * the `extractContentWithDiffbot` boundary and stores the result through
 * `saveToNewsContent`.
 *
 * Resolves to `{ success, message, data }` describing the stored content.
 * Throws an `Error` when the news item does not exist, or when extraction or
 * saving fails. If the failure carries `extractionMetrics`, a best-effort
 * attempt is made to persist them (with empty content) before re-throwing.
 */
export const extractContent = createTask({
  name,
  description,
  schema,
  boundaries,
  fn: async function ({ uuid }, { getNewsItem, saveToNewsContent, extractContentWithDiffbot }) {
    console.log(`🤖 [DIFFBOT] Starting content extraction for news item: ${uuid}`)

    // Get the news item
    const newsItem = await getNewsItem(uuid)
    if (!newsItem) {
      throw new Error(`News item not found with UUID: ${uuid}`)
    }

    console.log(`📄 [DIFFBOT] Found news item: "${newsItem.title}"`)
    console.log(`🔗 [DIFFBOT] URL: ${newsItem.url}`)

    try {
      // Extract content using Diffbot
      const extraction = await extractContentWithDiffbot(newsItem.url)

      // Save to NewsContent collection
      const newsContent = await saveToNewsContent(
        newsItem.uuid,
        newsItem.url,
        extraction.content,
        extraction.extractionMetrics
      )

      console.log(`✅ [DIFFBOT] Successfully extracted ${extraction.content.length} characters`)
      // NOTE(review): if NewsContentModel is a Mongoose model, `isNew` is reset to
      // false once save() succeeds, so this would always print 'updated' — confirm.
      console.log(`📝 [DIFFBOT] NewsContent UUID: ${newsContent.uuid} (${newsContent.isNew ? 'created' : 'updated'})`)
      console.log(`🔍 [DIFFBOT] Content preview: ${extraction.content.substring(0, 200)}...`)

      return {
        success: true,
        message: `Diffbot extraction completed for: ${newsItem.title}`,
        data: {
          newsUuid: newsItem.uuid,
          newsContentUuid: newsContent.uuid,
          provider: 'diffbot',
          contentLength: extraction.content.length,
          extractionDuration: extraction.extractionMetrics.extractionDuration,
          domain: extraction.extractionMetrics.domain
        }
      }
    } catch (error: unknown) {
      console.error(`❌ [DIFFBOT] Failed to extract content for ${uuid}:`, error)

      // If the failure carries extraction metrics, record the failed attempt.
      const failedMetrics = isFailureRecord(error) ? error.extractionMetrics : undefined
      if (failedMetrics) {
        try {
          await saveToNewsContent(
            newsItem.uuid,
            newsItem.url,
            '', // Empty content for failed extraction
            failedMetrics
          )
          console.log(`📝 [DIFFBOT] Saved failed extraction metrics to NewsContent`)
        } catch (saveError) {
          // Best effort only: a failure to record metrics must not hide the
          // original extraction error.
          console.error(`❌ [DIFFBOT] Failed to save error metrics:`, saveError)
        }
      }

      // Re-throw as a real Error (never a bare string) for proper task failure handling
      throw toTaskError(error)
    }
  }
})