Adds PDF content extraction.

This commit is contained in:
Stefan Schallerl 2025-02-07 15:46:09 +01:00
parent 3897fdc688
commit 2c2db1a42e
4 changed files with 64 additions and 29 deletions

View file

@ -0,0 +1,17 @@
package net.h34t.filemure
import org.apache.pdfbox.Loader
import org.apache.pdfbox.text.PDFTextStripper
/**
 * Extracts plain text from the raw bytes of uploaded files.
 */
class ContentExtractor {

    /**
     * Extracts the text content of a PDF document.
     *
     * The loaded document is always closed before this function returns,
     * whether text extraction succeeds or throws. Exceptions raised by PDFBox
     * while loading or stripping are not caught here and propagate to the caller.
     *
     * @param pdfBytes the complete PDF file as a byte array.
     * @return the text returned by [PDFTextStripper.getText] for the document.
     */
    fun extractPdf(pdfBytes: ByteArray): String =
        // The loaded document is Closeable; `use` releases it even when getText throws.
        Loader.loadPDF(pdfBytes).use { document ->
            PDFTextStripper().getText(document)
        }

    /**
     * Decodes the given bytes as UTF-8 text.
     *
     * @param bytes the raw file content.
     * @return the decoded string.
     */
    fun extractPlain(bytes: ByteArray): String = bytes.toString(Charsets.UTF_8)
}

View file

@ -1,12 +1,15 @@
package net.h34t.filemure.controller package net.h34t.filemure.controller
import io.javalin.http.Context import io.javalin.http.Context
import net.h34t.filemure.ContentExtractor
import net.h34t.filemure.TemplateModifiers import net.h34t.filemure.TemplateModifiers
import net.h34t.filemure.repository.SqliteRepository import net.h34t.filemure.repository.SqliteRepository
import net.h34t.filemure.requireSession import net.h34t.filemure.requireSession
class UploadController(val modifiers: TemplateModifiers, val repository: SqliteRepository) { class UploadController(val modifiers: TemplateModifiers, val repository: SqliteRepository) {
private val pdfContentExtractor = ContentExtractor()
fun upload(ctx: Context) { fun upload(ctx: Context) {
val session = ctx.requireSession() val session = ctx.requireSession()
@ -16,13 +19,30 @@ class UploadController(val modifiers: TemplateModifiers, val repository: SqliteR
val files = ctx.uploadedFiles() val files = ctx.uploadedFiles()
val extIds = files.map {
it.contentAndClose { contentStream ->
val content = contentStream.readAllBytes()
val contentType = it.contentType()
val contentExtracted = when (contentType) {
"application/pdf" -> pdfContentExtractor.extractPdf(content)
"text/plain" -> pdfContentExtractor.extractPlain(content)
else -> ""
}
repository.addFileToLimbo(
accountid,
it.filename(),
contentType,
it.size(),
contentExtracted,
content
).extId
}
}
when (target) { when (target) {
"document" -> { "document" -> {
val extIds = files.map {
it.contentAndClose { content ->
repository.addFileToLimbo(accountid, it.filename(), it.contentType(), it.size(), content).extId
}
}
ctx.status(200) ctx.status(200)
ctx.json( ctx.json(
Result( Result(
@ -33,11 +53,6 @@ class UploadController(val modifiers: TemplateModifiers, val repository: SqliteR
} }
"limbo" -> { "limbo" -> {
files.forEach {
it.contentAndClose { content ->
repository.addFileToLimbo(accountid, it.filename(), it.contentType(), it.size(), content)
}
}
ctx.status(200) ctx.status(200)
ctx.json( ctx.json(
Result( Result(

View file

@ -122,7 +122,8 @@ class SqliteRepository(url: String) {
filename: String, filename: String,
contentType: String?, contentType: String?,
size: Long, size: Long,
content: InputStream contentExtracted: String,
content: ByteArray
): IdPair = ): IdPair =
database.databaseQueries.transactionWithResult { database.databaseQueries.transactionWithResult {
val extId = generateExtId() val extId = generateExtId()
@ -132,7 +133,8 @@ class SqliteRepository(url: String) {
filename = filename, filename = filename,
content_type = contentType, content_type = contentType,
file_size = size, file_size = size,
content = content.readAllBytes() content_extracted = contentExtracted,
content = content
) )
IdPair( IdPair(
id = lastInsertedId(), id = lastInsertedId(),
@ -140,7 +142,6 @@ class SqliteRepository(url: String) {
) )
} }
fun addNewFileToDocument( fun addNewFileToDocument(
accountId: Long, accountId: Long,
documentId: Long, documentId: Long,

View file

@ -70,7 +70,7 @@ insertFileForDocument:
INSERT INTO file (account_id, document_id, ext_id, filename, content_type, file_size, content) VALUES (?, ?,?,?,?,?,?); INSERT INTO file (account_id, document_id, ext_id, filename, content_type, file_size, content) VALUES (?, ?,?,?,?,?,?);
insertFileIntoLimbo: insertFileIntoLimbo:
INSERT INTO file (account_id, ext_id, filename, content_type, file_size, content) VALUES (?,?,?,?,?,?); INSERT INTO file (account_id, ext_id, filename, content_type, file_size, content_extracted, content) VALUES (?,?,?,?,?,?, ?);
getLimboFileCount: getLimboFileCount:
SELECT count(*) AS count FROM file WHERE account_id=? AND document_id IS NULL AND state=?; SELECT count(*) AS count FROM file WHERE account_id=? AND document_id IS NULL AND state=?;
@ -221,21 +221,23 @@ UPDATE file SET state=? WHERE account_id=? AND ext_id IN ?;
searchDocument: searchDocument:
SELECT SELECT
id, d.id,
account_id, d.account_id,
ext_id, d.ext_id,
title, d.title,
description, d.description,
tags, d.tags,
created, d.created,
reference_date, d.reference_date,
state d.state
FROM FROM
document document d LEFT OUTER JOIN file f ON (f.document_id = d.id)
WHERE WHERE
account_id = :account_id AND d.account_id = :account_id AND
state = :state AND d.state = :state AND
(title LIKE :query OR (d.title LIKE :query OR
description LIKE :query AND d.description LIKE :query OR
tags LIKE :query); d.tags LIKE :query OR
f.filename LIKE :query OR
f.content_extracted LIKE :query);