You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

170 lines
5.5 KiB
C#

using System;
using System.Diagnostics;
using System.Drawing;
using System.Drawing.Imaging;
using System.IO;
using System.Linq;
using System.Text;
namespace PINBlog
{
/// <summary>
/// Languages that can be requested from the OCR engine.
/// The member name is used to build the value of Tesseract's <c>-l</c> command-line argument.
/// </summary>
public enum LANGUAGE
{
    /// <summary>English.</summary>
    ENG = 0
}
/// <summary>
/// Simple mutable holder for one OCR result: the language used and the text that was recognised.
/// </summary>
public class OCRResultInfo
{
    /// <summary>Language associated with this result. Defaults to <see cref="LANGUAGE.ENG"/>.</summary>
    public LANGUAGE Language { get; set; } = LANGUAGE.ENG;

    /// <summary>Recognised text. Defaults to the empty string, never <c>null</c> on a fresh instance.</summary>
    public string Result { get; set; } = "";
}
/// <summary>
/// Thin wrapper around the Tesseract command-line executable: writes image streams to temporary
/// files, runs <c>tesseract.exe</c> on them and returns the text it produced.
/// </summary>
public class Tesseract
{
    private readonly string m_TesseractExePath;
    private readonly LANGUAGE m_Language;

    /// <summary>
    /// Initializes a new instance of the <see cref="Tesseract"/> class.
    /// </summary>
    /// <remarks>
    /// The executable is looked up at the relative path <c>./tessdata/tesseract.exe</c>, and the
    /// <c>TESSDATA_PREFIX</c> environment variable of the current process is set to <c>./tessdata</c>.
    /// </remarks>
    /// <param name="language">The language used to extract text from images.</param>
    public Tesseract(LANGUAGE language = LANGUAGE.ENG)
    {
        // Tesseract configs.
        var dir = Path.Combine(".", "tessdata");
        m_TesseractExePath = Path.Combine(dir, "tesseract.exe");
        m_Language = language;
        Environment.SetEnvironmentVariable("TESSDATA_PREFIX", dir);
    }

    /// <summary>
    /// Encodes an image into a new in-memory stream positioned at its start.
    /// </summary>
    /// <param name="image">The image to encode.</param>
    /// <param name="format">
    /// The encoding to use. When <c>null</c>, the image's own <see cref="Image.RawFormat"/> is used,
    /// except for in-memory bitmaps (<see cref="ImageFormat.MemoryBmp"/>), which are saved as PNG.
    /// </param>
    /// <returns>A readable stream containing the encoded image, positioned at offset 0.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="image"/> is <c>null</c>.</exception>
    public Stream ToStream(Image image, ImageFormat format = null)
    {
        if (image == null)
        {
            throw new ArgumentNullException(nameof(image));
        }

        if (format == null)
        {
            // MemoryBmp has no encoder, so saving with it fails; PNG is a lossless substitute.
            format = ImageFormat.MemoryBmp.Equals(image.RawFormat) ? ImageFormat.Png : image.RawFormat;
        }

        var stream = new MemoryStream();
        image.Save(stream, format);
        stream.Position = 0;
        return stream;
    }

    /// <summary>
    /// Read text from the images streams.
    /// </summary>
    /// <remarks>
    /// Every input stream is rewound (when seekable), copied to a temporary file and then closed.
    /// The temporary directory is deleted before the method returns.
    /// </remarks>
    /// <param name="images">The images streams.</param>
    /// <returns>
    /// The recognised text with leading/trailing spaces, line feeds and form feeds removed,
    /// or the empty string when no stream is supplied.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="images"/> is <c>null</c>.</exception>
    /// <exception cref="InvalidOperationException">
    /// Tesseract exited with a non-zero code; the message is its standard-error output.
    /// </exception>
    public string GetText(params Stream[] images)
    {
        if (images == null)
        {
            throw new ArgumentNullException(nameof(images));
        }

        if (images.Length == 0)
        {
            return string.Empty;
        }

        var tempPath = Path.Combine(Path.GetTempPath(), Guid.NewGuid().ToString());
        Directory.CreateDirectory(tempPath);
        try
        {
            var tempInputFile = NewTempFileName(tempPath);
            var tempOutputFile = NewTempFileName(tempPath);
            WriteInputFiles(images, tempPath, tempInputFile);

            // Tesseract's trained-data files use lower-case language codes (eng, por, ...).
            var languageCode = m_Language.ToString().ToLowerInvariant();
            var info = new ProcessStartInfo
            {
                FileName = m_TesseractExePath,
                // Quote the paths: the temp directory may contain spaces.
                Arguments = $"\"{tempInputFile}\" \"{tempOutputFile}\" -l {languageCode}",
                RedirectStandardError = true,
                RedirectStandardOutput = true,
                CreateNoWindow = true,
                UseShellExecute = false
            };

            using (var ps = Process.Start(info))
            {
                // Both pipes are redirected, so both must be drained while the child runs;
                // otherwise it can block on a full pipe buffer and WaitForExit never returns.
                // stdout is not needed, so it is read asynchronously and discarded.
                ps.OutputDataReceived += (sender, e) => { };
                ps.BeginOutputReadLine();
                var stderr = ps.StandardError.ReadToEnd();
                ps.WaitForExit();

                if (ps.ExitCode != 0)
                {
                    throw new InvalidOperationException(stderr);
                }

                // Tesseract appends ".txt" to the output base name it is given.
                var output = File.ReadAllText(tempOutputFile + ".txt");
                return output.Trim(' ', '\n', '\f');
            }
        }
        finally
        {
            Directory.Delete(tempPath, true);
        }
    }

    /// <summary>
    /// Materialises the input streams as files inside <paramref name="tempPath"/>.
    /// With a single stream, <paramref name="tempInputFile"/> becomes the image itself; with several,
    /// each image gets its own file and <paramref name="tempInputFile"/> becomes a text file listing
    /// those image paths, one per line.
    /// </summary>
    private static void WriteInputFiles(Stream[] inputStreams, string tempPath, string tempInputFile)
    {
        if (inputStreams.Length > 1)
        {
            var imagesListFileContent = new StringBuilder();
            foreach (var inputStream in inputStreams)
            {
                var imageFile = NewTempFileName(tempPath);
                using (var tempStream = File.Create(imageFile))
                {
                    CopyStream(inputStream, tempStream);
                }
                imagesListFileContent.AppendLine(imageFile);
            }
            File.WriteAllText(tempInputFile, imagesListFileContent.ToString());
        }
        else
        {
            using (var tempStream = File.Create(tempInputFile))
            {
                CopyStream(inputStreams[0], tempStream);
            }
        }
    }

    /// <summary>
    /// Copies the whole of <paramref name="input"/> (from its start, when seekable) into
    /// <paramref name="output"/>, then closes <paramref name="input"/>.
    /// </summary>
    private static void CopyStream(Stream input, Stream output)
    {
        if (input.CanSeek)
        {
            input.Seek(0, SeekOrigin.Begin);
        }

        // CopyTo loops until end of stream, so partial reads and non-seekable streams are handled.
        input.CopyTo(output);
        input.Close();
    }

    /// <summary>Returns a fresh, GUID-named file path inside <paramref name="tempPath"/>.</summary>
    private static string NewTempFileName(string tempPath)
    {
        return Path.Combine(tempPath, Guid.NewGuid().ToString());
    }
}
}