Adds PDF content extraction.

This commit is contained in:
Stefan Schallerl 2025-02-07 15:46:09 +01:00
parent 3897fdc688
commit 2c2db1a42e
4 changed files with 64 additions and 29 deletions

View file

@ -0,0 +1,17 @@
package net.h34t.filemure
import org.apache.pdfbox.Loader
import org.apache.pdfbox.text.PDFTextStripper
/**
 * Turns the raw bytes of an uploaded file into plain text.
 *
 * The class holds no state; each method works only on the bytes passed to it.
 */
class ContentExtractor {

    /**
     * Extracts the text of a PDF document using [PDFTextStripper].
     *
     * @param pdfBytes the complete content of the PDF file.
     * @return the text that [PDFTextStripper] produces for the document.
     */
    fun extractPdf(pdfBytes: ByteArray): String =
        // The loaded document is Closeable; `use` closes it even when text
        // extraction throws, so its resources are not leaked per call.
        Loader.loadPDF(pdfBytes).use { document ->
            PDFTextStripper().getText(document)
        }

    /**
     * Decodes the given bytes as UTF-8 text.
     *
     * @param bytes the raw file content.
     * @return the decoded string.
     */
    fun extractPlain(bytes: ByteArray): String = bytes.toString(Charsets.UTF_8)
}

View file

@ -1,12 +1,15 @@
package net.h34t.filemure.controller
import io.javalin.http.Context
import net.h34t.filemure.ContentExtractor
import net.h34t.filemure.TemplateModifiers
import net.h34t.filemure.repository.SqliteRepository
import net.h34t.filemure.requireSession
class UploadController(val modifiers: TemplateModifiers, val repository: SqliteRepository) {
private val pdfContentExtractor = ContentExtractor()
fun upload(ctx: Context) {
val session = ctx.requireSession()
@ -16,13 +19,30 @@ class UploadController(val modifiers: TemplateModifiers, val repository: SqliteR
val files = ctx.uploadedFiles()
val extIds = files.map {
it.contentAndClose { contentStream ->
val content = contentStream.readAllBytes()
val contentType = it.contentType()
val contentExtracted = when (contentType) {
"application/pdf" -> pdfContentExtractor.extractPdf(content)
"text/plain" -> pdfContentExtractor.extractPlain(content)
else -> ""
}
repository.addFileToLimbo(
accountid,
it.filename(),
contentType,
it.size(),
contentExtracted,
content
).extId
}
}
when (target) {
"document" -> {
val extIds = files.map {
it.contentAndClose { content ->
repository.addFileToLimbo(accountid, it.filename(), it.contentType(), it.size(), content).extId
}
}
ctx.status(200)
ctx.json(
Result(
@ -33,11 +53,6 @@ class UploadController(val modifiers: TemplateModifiers, val repository: SqliteR
}
"limbo" -> {
files.forEach {
it.contentAndClose { content ->
repository.addFileToLimbo(accountid, it.filename(), it.contentType(), it.size(), content)
}
}
ctx.status(200)
ctx.json(
Result(

View file

@ -122,7 +122,8 @@ class SqliteRepository(url: String) {
filename: String,
contentType: String?,
size: Long,
content: InputStream
contentExtracted: String,
content: ByteArray
): IdPair =
database.databaseQueries.transactionWithResult {
val extId = generateExtId()
@ -132,7 +133,8 @@ class SqliteRepository(url: String) {
filename = filename,
content_type = contentType,
file_size = size,
content = content.readAllBytes()
content_extracted = contentExtracted,
content = content
)
IdPair(
id = lastInsertedId(),
@ -140,7 +142,6 @@ class SqliteRepository(url: String) {
)
}
fun addNewFileToDocument(
accountId: Long,
documentId: Long,

View file

@ -70,7 +70,7 @@ insertFileForDocument:
INSERT INTO file (account_id, document_id, ext_id, filename, content_type, file_size, content) VALUES (?, ?,?,?,?,?,?);
insertFileIntoLimbo:
INSERT INTO file (account_id, ext_id, filename, content_type, file_size, content) VALUES (?,?,?,?,?,?);
INSERT INTO file (account_id, ext_id, filename, content_type, file_size, content_extracted, content) VALUES (?,?,?,?,?,?, ?);
getLimboFileCount:
SELECT count(*) AS count FROM file WHERE account_id=? AND document_id IS NULL AND state=?;
@ -221,21 +221,23 @@ UPDATE file SET state=? WHERE account_id=? AND ext_id IN ?;
searchDocument:
SELECT
id,
account_id,
ext_id,
title,
description,
tags,
created,
reference_date,
state
d.id,
d.account_id,
d.ext_id,
d.title,
d.description,
d.tags,
d.created,
d.reference_date,
d.state
FROM
document
document d LEFT OUTER JOIN file f ON (f.document_id = d.id)
WHERE
account_id = :account_id AND
state = :state AND
(title LIKE :query OR
description LIKE :query AND
tags LIKE :query);
d.account_id = :account_id AND
d.state = :state AND
(d.title LIKE :query OR
d.description LIKE :query OR
d.tags LIKE :query OR
f.filename LIKE :query OR
f.content_extracted LIKE :query);