<?php
// Diagnostics: report every error level and print errors, including those
// raised while PHP itself is starting up, directly into the response.
error_reporting(E_ALL);
ini_set('display_startup_errors', 1);
ini_set('display_errors', 1);

require 'vendor/autoload.php';

use thiagoalessio\TesseractOCR\TesseractOCR;

// Every response from this endpoint, including errors, is a JSON document of
// the form {"status": "success"|"error", ...}.
header('Content-Type: application/json');

// PDF rasterisation below depends on the Imagick extension.
if (!extension_loaded('imagick')) {
    echo json_encode(['status' => 'error', 'message' => 'Imagick extension not installed']);
    exit;
}

if ($_SERVER['REQUEST_METHOD'] !== 'POST') {
    echo json_encode(['status' => 'error', 'message' => 'Invalid request method']);
    exit;
}

if (!isset($_FILES['pdf']) || $_FILES['pdf']['error'] !== UPLOAD_ERR_OK) {
    echo json_encode(['status' => 'error', 'message' => 'Invalid file upload']);
    exit;
}

$pdfFilePath = $_FILES['pdf']['tmp_name'];
$outputDir = 'output_images';

// ImageMagick picks its decoder from the file content, so an unchecked upload
// would reach every coder it supports. Only accept files carrying the PDF
// signature within the first 1024 bytes.
$pdfHead = file_get_contents($pdfFilePath, false, null, 0, 1024);
if ($pdfHead === false || strpos($pdfHead, '%PDF-') === false) {
    echo json_encode(['status' => 'error', 'message' => 'Uploaded file is not a PDF']);
    exit;
}

// The output directory is created lazily by convertPdfToImages(), so on a
// first run it does not exist yet: in that case the parent directory is the
// one that has to be writable.
$writeTarget = is_dir($outputDir) ? $outputDir : dirname($outputDir);
if (!is_writable($writeTarget)) {
    echo json_encode(['status' => 'error', 'message' => 'Permission denied to create directory']);
    exit;
}

try {
    // Convert PDF to one image per page
    $images = convertPdfToImages($pdfFilePath, $outputDir);

    // Initialize TesseractOCR
    $ocr = new TesseractOCR();

    // Read text from each page image
    $result = [];
    foreach ($images as $image) {
        $result[] = [
            'image' => $image,
            'text' => $ocr->image($image)->run(),
        ];
    }

    // Extract required information
    $extractedData = extractInformation($result);
} catch (Exception $e) {
    // Keep the details in the server log; the client still receives valid
    // JSON instead of a fatal-error page.
    error_log('PDF OCR failed: ' . $e->getMessage());
    echo json_encode(['status' => 'error', 'message' => 'Failed to process PDF']);
    exit;
}

echo json_encode(['status' => 'success', 'data' => $extractedData]);

/**
 * Rasterises every page of a PDF to a PNG file inside $outputDir.
 *
 * Pages are rendered at 300x300 DPI and written as
 * "<outputDir>/page-<index>.png", where <index> is the key Imagick's iterator
 * yields for the page. The directory is created (recursively) when missing.
 *
 * @param string $pdfFilePath Path of the PDF to read.
 * @param string $outputDir   Directory that receives the PNG files.
 *
 * @return array Paths of the written PNG files, in page order.
 *
 * @throws RuntimeException When the output directory cannot be created.
 * @throws ImagickException When the PDF cannot be read or a page cannot be written.
 */
function convertPdfToImages($pdfFilePath, $outputDir)
{
    // The trailing is_dir() re-check tolerates a concurrent request creating
    // the directory between our test and our mkdir().
    if (!is_dir($outputDir) && !mkdir($outputDir, 0777, true) && !is_dir($outputDir)) {
        throw new RuntimeException('Unable to create output directory: ' . $outputDir);
    }

    $imagick = new Imagick();

    try {
        // Resolution must be set before reading for it to affect rasterisation.
        $imagick->setResolution(300, 300);
        $imagick->readImage($pdfFilePath);

        $images = [];
        foreach ($imagick as $index => $image) {
            // setImageFormat() applies to a single image of the sequence, so
            // it has to be called for each page rather than once up front.
            $image->setImageFormat('png');

            $imagePath = $outputDir . '/page-' . $index . '.png';
            $image->writeImage($imagePath);
            $images[] = $imagePath;
        }

        return $images;
    } finally {
        // Release the page rasters as soon as they are on disk.
        $imagick->clear();
    }
}

/**
 * Extracts the course code, employee name and release date from OCR'd pages.
 *
 * Each page's text is searched for three markers:
 *  - "Conferito a <NAME>": the upper-case name on the line that follows the
 *    marker (letters of any alphabet, spaces, apostrophes, dots, hyphens);
 *  - "Periodo della formazione: dal dd/mm/yyyy al dd/mm/yyyy": the second
 *    date (after "al") is the release date;
 *  - "Protocollo: XXX...": the first three characters of the protocol number
 *    are the course code.
 *
 * When a marker appears on several pages, the last page that matches wins.
 * Fields whose marker is never found are returned as empty strings.
 *
 * @param array $ocrResults List of rows shaped like ['image' => string, 'text' => string].
 *
 * @return array ['course_name' => string, 'employee_name' => string, 'release_date' => string]
 */
function extractInformation($ocrResults)
{
    $courseName = '';
    $employeeName = '';
    $releaseDate = '';

    // The name is captured on a single line only (\h excludes line breaks) and
    // must run up to a line break or the end of the text, so a mixed-case
    // continuation such as "Mario Rossi" is rejected rather than truncated.
    $namePattern = '/Conferito a\s+(\p{Lu}[\p{Lu}\h\'.-]*)(?=\R|\z)/';

    foreach ($ocrResults as $result) {
        $text = isset($result['text']) ? (string) $result['text'] : '';

        // Extract employee name. Unicode mode lets accented capitals match.
        $nameFound = preg_match($namePattern . 'u', $text, $matches);
        if ($nameFound === false) {
            // The text is not valid UTF-8; retry in byte mode so plain ASCII
            // names are still recognised.
            $nameFound = preg_match($namePattern, $text, $matches);
        }
        if ($nameFound === 1) {
            $employeeName = trim($matches[1]);
        }

        // Extract release date, e.g. from
        // "Periodo della formazione: dal 11/10/2024 al 23/10/2024" take the date after "al".
        if (preg_match('/Periodo della formazione: dal \d{2}\/\d{2}\/\d{4} al (\d{2}\/\d{2}\/\d{4})/', $text, $matches)) {
            $releaseDate = $matches[1];
        }

        // Extract course name, e.g. from "Protocollo: 547A.51026628.MO.14806"
        // take only the first three characters after "Protocollo: ".
        if (preg_match('/Protocollo: ([A-Z0-9]{3})/', $text, $matches)) {
            $courseName = $matches[1];
        }
    }

    return [
        'course_name' => $courseName,
        'employee_name' => $employeeName,
        'release_date' => $releaseDate
    ];
}
