GuilhermeStracini/POC-dotnet-ExtractPdfContent

View on GitHub
Src/POCExtractPdfContent/Extractors/TextSharpExtractor.cs

Summary

Maintainability
A
0 mins
Test Coverage
using System.Text;
using iTextSharp.text.pdf;

namespace POCExtractPdfContent.Extractors;

/// <summary>
/// Class TextSharpExtractor.
/// Pulls text out of a PDF document by tokenizing a page content stream with iTextSharp.
/// Implements the <see cref="IPdfExtractor" />
/// </summary>
/// <seealso cref="IPdfExtractor" />
public class TextSharpExtractor : IPdfExtractor
{
    /// <summary>
    /// Extracts the string tokens from the content stream of page 1 of the given PDF
    /// and concatenates them, without any separator, in the order they are encountered.
    /// </summary>
    /// <param name="data">The bytes of the PDF document to read.</param>
    /// <returns>
    /// The concatenated string tokens of page 1; an empty string when the page
    /// content yields no string tokens.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="data" /> is <c>null</c>.</exception>
    public string Extract(byte[] data)
    {
        // Fail fast with a clear argument error instead of an opaque failure inside the PDF reader.
        ArgumentNullException.ThrowIfNull(data);

        using var reader = new PdfReader(data);

        // Only page number 1 is read; content of any further pages is not extracted.
        var contentBytes = reader.GetPageContent(1);
        var tokenizer = new PrTokeniser(new RandomAccessFileOrArray(contentBytes));

        var text = new StringBuilder();

        while (tokenizer.NextToken())
        {
            // Operators, numbers, names, etc. are skipped; only string tokens carry text.
            if (tokenizer.TokenType == PrTokeniser.TK_STRING)
            {
                text.Append(tokenizer.StringValue);
            }
        }

        return text.ToString();
    }
}