diffbot:extractContent

Extract article content from news URLs using Diffbot Article API

Published September 4, 2025
Documentation

Overview

This task extracts full article content from news URLs using the Diffbot Article API. It retrieves a news item by UUID, uses Diffbot's AI-powered content extraction service to parse the article text from the URL, and stores the extracted content along with performance metrics in the database. The task handles both new extractions and updates to existing content, providing robust error handling and detailed logging throughout the process.

Implementation

The main function orchestrates a three-step content extraction workflow:

  1. News Item Retrieval: Calls the getNewsItem boundary function to fetch the news item by UUID from the MongoDB database and verify that it exists

  2. Content Extraction: Uses the extractContentWithDiffbot boundary function to send the news URL to the Diffbot Article API service, which uses AI to identify and extract clean article text from the web page

  3. Content Storage: Calls the saveToNewsContent boundary function to create a new NewsContent record, or update the existing one, in the MongoDB database with the extracted text and performance metrics

The task includes comprehensive error handling that captures failed extractions with metrics, detailed console logging for monitoring, and content preview functionality for verification. If extraction fails, the task still attempts to save failure metrics before re-throwing the error for proper task failure handling.

Source Code
// TASK: extractContent
// Run this task with:
// forge task:run diffbot:extractContent --uuid="[NEWS_UUID]"

import { createTask } from '@forgehive/task'
import { Schema } from '@forgehive/schema'
import { NewsModel, NewsContentModel } from '../../models/index.js'

// Task identifier; matches the name used in the `forge task:run` command above.
const name = 'diffbot:extractContent'
// Human-readable summary passed to createTask alongside the name.
const description = 'Extract article content from news URLs using Diffbot Article API'

// Input contract for the task: a single string `uuid`, which the task passes
// to the getNewsItem boundary to look up the news item.
const schema = new Schema({
  uuid: Schema.string()
})

/**
 * Side-effecting boundaries used by the `diffbot:extractContent` task:
 * news lookup, NewsContent persistence, and the Diffbot Article API call.
 */
const boundaries = {
  /**
   * Looks up a single news item by its `uuid` field.
   *
   * @param uuid - Value matched against the `uuid` field via `NewsModel.findOne`.
   * @returns The `.lean()` result of the query; the task treats a falsy result
   *          as "not found".
   */
  getNewsItem: async (uuid: string) => {
    return await NewsModel.findOne({ uuid }).lean()
  },

  /**
   * Creates or updates the NewsContent record for provider `'diffbot'` and the
   * given news item.
   *
   * @param newsUuid - UUID of the news item the content belongs to.
   * @param url - Accepted for call-site compatibility; not written to the record.
   * @param extractedContent - Extracted text to store.
   * @param extractionMetrics - Metrics object stored as-is on the record.
   * @returns The result of `save()` on the created or updated document.
   *
   * NOTE(review): the task calls this with an empty string when extraction
   * fails, so a failed re-run replaces previously stored content with ''.
   * Confirm that this is intended.
   */
  saveToNewsContent: async (newsUuid: string, url: string, extractedContent: string, extractionMetrics: any) => {
    // Check if content already exists for this provider and news item
    const existingContent = await NewsContentModel.findOne({
      newsUuid,
      provider: 'diffbot'
    })

    if (existingContent) {
      console.log(`📝 [DIFFBOT] Updating existing NewsContent: ${existingContent.uuid}`)
      // Update existing content
      existingContent.extractedContent = extractedContent
      existingContent.extractionMetrics = extractionMetrics
      existingContent.updatedAt = new Date()

      return await existingContent.save()
    } else {
      console.log(`📝 [DIFFBOT] Creating new NewsContent`)
      // Create new content
      const newsContent = new NewsContentModel({
        newsUuid,
        provider: 'diffbot',
        extractedContent,
        extractionMetrics
      })

      return await newsContent.save()
    }
  },

  /**
   * Fetches `url` through the Diffbot Article API and returns the article text
   * together with extraction metrics.
   *
   * @param url - Article URL to extract.
   * @returns `{ content, extractionMetrics }` where `extractionMetrics.success` is `true`.
   * @throws Error when `DIFFBOT_API_TOKEN` is not set (no metrics attached).
   * @throws Error carrying two extra properties when the request or parsing
   *         fails: `error` (the message string) and `extractionMetrics`
   *         (with `success: false`), so callers can persist the failed attempt.
   */
  extractContentWithDiffbot: async (url: string) => {
    const token = process.env.DIFFBOT_API_TOKEN
    if (!token) {
      throw new Error('DIFFBOT_API_TOKEN environment variable is required')
    }

    const startTime = Date.now()

    // Resolve the metrics domain once, up front. A malformed URL must not throw
    // from inside the failure handler below, or the failure metrics are lost.
    let domain = 'unknown'
    try {
      domain = new URL(url).hostname
    } catch {
      // Malformed URL: keep the placeholder and let the API call report the failure.
    }

    try {
      console.log(`🔍 [DIFFBOT] Extracting content from: ${url}`)

      // URLSearchParams percent-encodes both values, so a token containing
      // reserved characters cannot corrupt the query string.
      const searchParams = new URLSearchParams({ token, url })

      const response = await fetch(
        `https://api.diffbot.com/v3/article?${searchParams.toString()}`,
        {
          method: 'GET',
          headers: {}
        }
      )

      if (!response.ok) {
        throw new Error(`Diffbot API error: ${response.status} ${response.statusText}`)
      }

      const result = await response.json()
      const extractionDuration = Date.now() - startTime

      console.log(`📊 [DIFFBOT] API Response:`, JSON.stringify(result, null, 2))

      // Handle Article API response format
      let extractedContent = ''

      if (result.objects && result.objects.length > 0) {
        // Standard objects response: prefer plain text, fall back to HTML
        const contentObject = result.objects[0]
        if (contentObject.text) {
          extractedContent = contentObject.text
        } else if (contentObject.html) {
          extractedContent = contentObject.html
        }
      } else if (result.text) {
        // Direct text response
        extractedContent = result.text
      } else if (result.html) {
        // Direct HTML response
        extractedContent = result.html
      }

      if (!extractedContent || extractedContent.trim().length === 0) {
        // Surface an error reported in the response body instead of the
        // generic "no content" message.
        if (result.error) {
          throw new Error(`Diffbot API error: ${result.errorCode ?? 'unknown'} ${result.error}`)
        }
        throw new Error(`No text content found in Diffbot response. Response structure: ${JSON.stringify(Object.keys(result))}`)
      }

      return {
        content: extractedContent,
        extractionMetrics: {
          contentLength: extractedContent.length,
          extractionDuration,
          extractedAt: new Date(),
          domain,
          success: true
        }
      }

    } catch (error) {
      console.error(`❌ [DIFFBOT] Extraction failed:`, error)

      const extractionDuration = Date.now() - startTime
      const message = error instanceof Error ? error.message : 'Unknown error'

      // Throw a real Error (not a plain object) while keeping the `error` and
      // `extractionMetrics` properties that callers already read.
      throw Object.assign(new Error(message), {
        error: message,
        cause: error,
        extractionMetrics: {
          contentLength: 0,
          extractionDuration,
          extractedAt: new Date(),
          domain,
          success: false,
          errorMessage: message
        }
      })
    }
  }
}

/**
 * Task `diffbot:extractContent`.
 *
 * Loads the news item identified by `uuid`, extracts its article text through
 * the `extractContentWithDiffbot` boundary, and stores the result through the
 * `saveToNewsContent` boundary.
 *
 * Returns `{ success: true, message, data }` on success. Throws an `Error`
 * when the news item does not exist or when extraction/saving fails; if the
 * failure carries `extractionMetrics`, a best-effort attempt is made to store
 * them (with empty content) before the error is re-thrown.
 */
export const extractContent = createTask({
  name,
  description,
  schema,
  boundaries,
  fn: async function ({ uuid }, { getNewsItem, saveToNewsContent, extractContentWithDiffbot }) {
    console.log(`🤖 [DIFFBOT] Starting content extraction for news item: ${uuid}`)

    // Get the news item
    const newsItem = await getNewsItem(uuid)
    if (!newsItem) {
      throw new Error(`News item not found with UUID: ${uuid}`)
    }

    console.log(`📄 [DIFFBOT] Found news item: "${newsItem.title}"`)
    console.log(`🔗 [DIFFBOT] URL: ${newsItem.url}`)

    try {
      // Extract content using Diffbot
      const extraction = await extractContentWithDiffbot(newsItem.url)

      // Save to NewsContent collection
      const newsContent = await saveToNewsContent(
        newsItem.uuid,
        newsItem.url,
        extraction.content,
        extraction.extractionMetrics
      )

      console.log(`✅ [DIFFBOT] Successfully extracted ${extraction.content.length} characters`)
      // Do not derive "created"/"updated" from `isNew` here: the document has
      // already been saved, so the flag no longer distinguishes the two cases.
      // The saveToNewsContent boundary logs which path was taken.
      console.log(`📝 [DIFFBOT] NewsContent UUID: ${newsContent.uuid}`)
      console.log(`🔍 [DIFFBOT] Content preview: ${extraction.content.substring(0, 200)}...`)

      return {
        success: true,
        message: `Diffbot extraction completed for: ${newsItem.title}`,
        data: {
          newsUuid: newsItem.uuid,
          newsContentUuid: newsContent.uuid,
          provider: 'diffbot',
          contentLength: extraction.content.length,
          extractionDuration: extraction.extractionMetrics.extractionDuration,
          domain: extraction.extractionMetrics.domain
        }
      }

    } catch (error: unknown) {
      console.error(`❌ [DIFFBOT] Failed to extract content for ${uuid}:`, error)

      // The extraction boundary attaches `error` (message) and
      // `extractionMetrics` to what it throws; read them defensively because
      // the failure may also come from saving or from an unrelated Error.
      const failure: { error?: unknown; extractionMetrics?: unknown } =
        typeof error === 'object' && error !== null ? error : {}

      // If we have extraction metrics from the error, save failed attempt
      if (failure.extractionMetrics) {
        try {
          await saveToNewsContent(
            newsItem.uuid,
            newsItem.url,
            '', // Empty content for failed extraction
            failure.extractionMetrics
          )
          console.log(`📝 [DIFFBOT] Saved failed extraction metrics to NewsContent`)
        } catch (saveError) {
          // Best effort only: the original extraction failure is what gets reported.
          console.error(`❌ [DIFFBOT] Failed to save error metrics:`, saveError)
        }
      }

      // Re-throw for proper task failure handling, always as an Error so the
      // runner gets a message and stack rather than a bare string.
      if (error instanceof Error) {
        throw error
      }
      if (typeof failure.error === 'string' && failure.error.length > 0) {
        throw new Error(failure.error)
      }
      throw new Error(typeof error === 'string' && error.length > 0 ? error : 'Unknown error')
    }
  }
})