diffbot:extractContent by Hive

Overview

This task extracts full article content from news URLs using the Diffbot Article API. It retrieves a news item by UUID, uses Diffbot's AI-powered content extraction service to parse the article text from the URL, and stores the extracted content along with performance metrics in the database. The task handles both new extractions and updates to existing content, providing robust error handling and detailed logging throughout the process.

Implementation

The main function orchestrates a three-step content extraction workflow:

News Item Retrieval: Calls the getNewsItem boundary function to fetch the news item by UUID from the MongoDB database and verifies that it exists
Content Extraction: Uses the extractContentWithDiffbot boundary function to send the news URL to Diffbot's Article API, which uses AI to identify and extract clean article text from the web page
Content Storage: Calls the saveToNewsContent boundary function to create a new NewsContent record, or update the existing one, in the MongoDB database with the extracted text and performance metrics

The task includes comprehensive error handling that captures failed extractions with metrics, detailed console logging for monitoring, and content preview functionality for verification. If extraction fails, the task still attempts to save failure metrics before re-throwing the error for proper task failure handling.

// TASK: extractContent
// Run this task with:
// forge task:run diffbot:extractContent --uuid="[NEWS_UUID]"

import { createTask } from '@forgehive/task'
import { Schema } from '@forgehive/schema'
import { NewsModel, NewsContentModel } from '../../models/index.js'

const name = 'diffbot:extractContent'
const description = 'Extract article content from news URLs using Diffbot Article API'

const schema = new Schema({
  uuid: Schema.string()
})

/** Metrics recorded for every extraction attempt, successful or not. */
interface ExtractionMetrics {
  contentLength: number
  extractionDuration: number
  extractedAt: Date
  domain: string
  success: boolean
  errorMessage?: string
}

/** The subset of the Diffbot Article API response that this task reads. */
interface DiffbotArticleResponse {
  objects?: Array<{ text?: string; html?: string } | undefined>
  text?: string
  html?: string
}

/**
 * Error thrown by `extractContentWithDiffbot` when an extraction attempt fails.
 * It carries the failure metrics so the task can persist them before failing.
 */
class DiffbotExtractionError extends Error {
  readonly extractionMetrics: ExtractionMetrics

  constructor(message: string, extractionMetrics: ExtractionMetrics) {
    super(message)
    this.name = 'DiffbotExtractionError'
    this.extractionMetrics = extractionMetrics
  }
}

/**
 * Returns the hostname of `url`, or 'unknown' when `url` cannot be parsed.
 * Never throws, so it is safe to call while handling another error.
 */
function safeHostname(url: string): string {
  try {
    return new URL(url).hostname
  } catch {
    return 'unknown'
  }
}

/**
 * Picks the article body out of a Diffbot response.
 * When `objects` is non-empty only its first entry is considered (text, then html);
 * otherwise the top-level `text` / `html` fields are used. Returns '' when nothing is found.
 */
function pickArticleContent(result: DiffbotArticleResponse): string {
  if (result.objects && result.objects.length > 0) {
    const first = result.objects[0]
    return first?.text || first?.html || ''
  }
  return result.text || result.html || ''
}

/**
 * Reads `extractionMetrics` off a caught value, if present.
 * Accepts any object shape so that both DiffbotExtractionError and plain
 * `{ error, extractionMetrics }` objects are handled.
 */
function metricsFromError(err: unknown): unknown {
  if (typeof err === 'object' && err !== null && 'extractionMetrics' in err) {
    return (err as { extractionMetrics?: unknown }).extractionMetrics
  }
  return undefined
}

/** Normalises a caught value to an Error so the task never throws a bare string or object. */
function toError(err: unknown): Error {
  if (err instanceof Error) {
    return err
  }
  if (typeof err === 'object' && err !== null) {
    const legacyMessage = (err as { error?: unknown }).error
    if (typeof legacyMessage === 'string' && legacyMessage.length > 0) {
      return new Error(legacyMessage)
    }
  }
  return new Error(typeof err === 'string' && err.length > 0 ? err : 'Unknown error')
}

const boundaries = {
  /** Looks up a news item by UUID; resolves to a plain (lean) object or null. */
  getNewsItem: async (uuid: string) => {
    return await NewsModel.findOne({ uuid }).lean()
  },

  /**
   * Creates or updates the 'diffbot' NewsContent record for a news item.
   *
   * @param newsUuid - UUID of the news item the content belongs to
   * @param url - accepted for signature compatibility; not used by this boundary
   * @param extractedContent - extracted article text ('' for failed attempts)
   * @param extractionMetrics - metrics object stored on the record
   * @returns the saved NewsContent document
   */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- kept as `any` to match the model's field type, which is declared elsewhere
  saveToNewsContent: async (newsUuid: string, url: string, extractedContent: string, extractionMetrics: any) => {
    // Check if content already exists for this provider and news item
    const existingContent = await NewsContentModel.findOne({
      newsUuid,
      provider: 'diffbot'
    })

    if (existingContent) {
      console.log(`📝 [DIFFBOT] Updating existing NewsContent: ${existingContent.uuid}`)
      existingContent.extractedContent = extractedContent
      existingContent.extractionMetrics = extractionMetrics
      existingContent.updatedAt = new Date()
      return await existingContent.save()
    }

    console.log(`📝 [DIFFBOT] Creating new NewsContent`)
    const newsContent = new NewsContentModel({
      newsUuid,
      provider: 'diffbot',
      extractedContent,
      extractionMetrics
    })
    return await newsContent.save()
  },

  /**
   * Fetches `url` through the Diffbot Article API and returns the extracted text with metrics.
   *
   * @throws Error when DIFFBOT_API_TOKEN is not set (no metrics attached)
   * @throws DiffbotExtractionError for any failure after the request has started;
   *         the error carries `extractionMetrics` with `success: false`
   */
  extractContentWithDiffbot: async (url: string) => {
    const token = process.env.DIFFBOT_API_TOKEN
    if (!token) {
      throw new Error('DIFFBOT_API_TOKEN environment variable is required')
    }

    const startTime = Date.now()

    try {
      console.log(`🔍 [DIFFBOT] Extracting content from: ${url}`)

      // Build the whole query with URLSearchParams so the token is encoded as well as the URL
      const searchParams = new URLSearchParams({ token, url })

      const response = await fetch(
        `https://api.diffbot.com/v3/article?${searchParams.toString()}`,
        {
          method: 'GET',
          headers: {}
        }
      )

      if (!response.ok) {
        throw new Error(`Diffbot API error: ${response.status} ${response.statusText}`)
      }

      const result = (await response.json()) as DiffbotArticleResponse
      const extractionDuration = Date.now() - startTime

      console.log(`📊 [DIFFBOT] API Response:`, JSON.stringify(result, null, 2))

      const extractedContent = pickArticleContent(result)

      if (!extractedContent || extractedContent.trim().length === 0) {
        throw new Error(`No text content found in Diffbot response. Response structure: ${JSON.stringify(Object.keys(result))}`)
      }

      return {
        content: extractedContent,
        extractionMetrics: {
          contentLength: extractedContent.length,
          extractionDuration,
          extractedAt: new Date(),
          domain: safeHostname(url),
          success: true
        }
      }
    } catch (error) {
      console.error(`❌ [DIFFBOT] Extraction failed:`, error)

      const message = error instanceof Error ? error.message : 'Unknown error'

      // safeHostname cannot throw, so a malformed URL no longer masks the original failure
      throw new DiffbotExtractionError(message, {
        contentLength: 0,
        extractionDuration: Date.now() - startTime,
        extractedAt: new Date(),
        domain: safeHostname(url),
        success: false,
        errorMessage: message
      })
    }
  }
}

/**
 * Task: load a news item by UUID, extract its article text via Diffbot, and
 * store the result in NewsContent. On failure, any metrics attached to the
 * error are saved (best effort) before the error is re-thrown.
 */
export const extractContent = createTask({
  name,
  description,
  schema,
  boundaries,
  fn: async function ({ uuid }, { getNewsItem, saveToNewsContent, extractContentWithDiffbot }) {
    console.log(`🤖 [DIFFBOT] Starting content extraction for news item: ${uuid}`)

    // Get the news item
    const newsItem = await getNewsItem(uuid)
    if (!newsItem) {
      throw new Error(`News item not found with UUID: ${uuid}`)
    }

    console.log(`📄 [DIFFBOT] Found news item: "${newsItem.title}"`)
    console.log(`🔗 [DIFFBOT] URL: ${newsItem.url}`)

    try {
      // Extract content using Diffbot
      const extraction = await extractContentWithDiffbot(newsItem.url)

      // Save to NewsContent collection
      const newsContent = await saveToNewsContent(
        newsItem.uuid,
        newsItem.url,
        extraction.content,
        extraction.extractionMetrics
      )

      console.log(`✅ [DIFFBOT] Successfully extracted ${extraction.content.length} characters`)
      // A document's isNew flag is already false once save() has succeeded, so it cannot
      // tell created from updated here; saveToNewsContent logs which of the two happened.
      console.log(`📝 [DIFFBOT] NewsContent UUID: ${newsContent.uuid}`)
      console.log(`🔍 [DIFFBOT] Content preview: ${extraction.content.substring(0, 200)}...`)

      return {
        success: true,
        message: `Diffbot extraction completed for: ${newsItem.title}`,
        data: {
          newsUuid: newsItem.uuid,
          newsContentUuid: newsContent.uuid,
          provider: 'diffbot',
          contentLength: extraction.content.length,
          extractionDuration: extraction.extractionMetrics.extractionDuration,
          domain: extraction.extractionMetrics.domain
        }
      }
    } catch (error: unknown) {
      console.error(`❌ [DIFFBOT] Failed to extract content for ${uuid}:`, error)

      // If we have extraction metrics from the error, save the failed attempt
      const failureMetrics = metricsFromError(error)
      if (failureMetrics) {
        try {
          await saveToNewsContent(
            newsItem.uuid,
            newsItem.url,
            '', // Empty content for failed extraction
            failureMetrics
          )
          console.log(`📝 [DIFFBOT] Saved failed extraction metrics to NewsContent`)
        } catch (saveError) {
          // Best effort only: the extraction error below is the one that matters
          console.error(`❌ [DIFFBOT] Failed to save error metrics:`, saveError)
        }
      }

      // Re-throw as a real Error (never a bare string) for proper task failure handling
      throw toError(error)
    }
  }
})