Adds PDF content extraction.
This commit is contained in:
parent
3897fdc688
commit
2c2db1a42e
4 changed files with 64 additions and 29 deletions
17
app/src/main/kotlin/net/h34t/filemure/ContentExtractor.kt
Normal file
17
app/src/main/kotlin/net/h34t/filemure/ContentExtractor.kt
Normal file
|
@ -0,0 +1,17 @@
|
|||
package net.h34t.filemure
|
||||
|
||||
import org.apache.pdfbox.Loader
|
||||
import org.apache.pdfbox.text.PDFTextStripper
|
||||
|
||||
/**
 * Turns the raw bytes of an uploaded file into plain text.
 */
class ContentExtractor {

    /**
     * Extracts the text of a PDF document.
     *
     * @param pdfBytes the raw bytes of the PDF file.
     * @return the text [PDFTextStripper] produces for the loaded document.
     */
    fun extractPdf(pdfBytes: ByteArray): String =
        // The loaded document is Closeable; `use` releases it even when text
        // stripping throws, so repeated uploads do not leak its resources.
        Loader.loadPDF(pdfBytes).use { doc ->
            PDFTextStripper().getText(doc)
        }

    /**
     * Decodes the given bytes as UTF-8 text.
     *
     * @param bytes the raw file content.
     * @return the decoded string.
     */
    fun extractPlain(bytes: ByteArray): String = bytes.toString(Charsets.UTF_8)
}
|
|
@ -1,12 +1,15 @@
|
|||
package net.h34t.filemure.controller
|
||||
|
||||
import io.javalin.http.Context
|
||||
import net.h34t.filemure.ContentExtractor
|
||||
import net.h34t.filemure.TemplateModifiers
|
||||
import net.h34t.filemure.repository.SqliteRepository
|
||||
import net.h34t.filemure.requireSession
|
||||
|
||||
class UploadController(val modifiers: TemplateModifiers, val repository: SqliteRepository) {
|
||||
|
||||
private val pdfContentExtractor = ContentExtractor()
|
||||
|
||||
fun upload(ctx: Context) {
|
||||
val session = ctx.requireSession()
|
||||
|
||||
|
@ -16,13 +19,30 @@ class UploadController(val modifiers: TemplateModifiers, val repository: SqliteR
|
|||
|
||||
val files = ctx.uploadedFiles()
|
||||
|
||||
val extIds = files.map {
|
||||
it.contentAndClose { contentStream ->
|
||||
val content = contentStream.readAllBytes()
|
||||
val contentType = it.contentType()
|
||||
|
||||
val contentExtracted = when (contentType) {
|
||||
"application/pdf" -> pdfContentExtractor.extractPdf(content)
|
||||
"text/plain" -> pdfContentExtractor.extractPlain(content)
|
||||
else -> ""
|
||||
}
|
||||
|
||||
repository.addFileToLimbo(
|
||||
accountid,
|
||||
it.filename(),
|
||||
contentType,
|
||||
it.size(),
|
||||
contentExtracted,
|
||||
content
|
||||
).extId
|
||||
}
|
||||
}
|
||||
|
||||
when (target) {
|
||||
"document" -> {
|
||||
val extIds = files.map {
|
||||
it.contentAndClose { content ->
|
||||
repository.addFileToLimbo(accountid, it.filename(), it.contentType(), it.size(), content).extId
|
||||
}
|
||||
}
|
||||
ctx.status(200)
|
||||
ctx.json(
|
||||
Result(
|
||||
|
@ -33,11 +53,6 @@ class UploadController(val modifiers: TemplateModifiers, val repository: SqliteR
|
|||
}
|
||||
|
||||
"limbo" -> {
|
||||
files.forEach {
|
||||
it.contentAndClose { content ->
|
||||
repository.addFileToLimbo(accountid, it.filename(), it.contentType(), it.size(), content)
|
||||
}
|
||||
}
|
||||
ctx.status(200)
|
||||
ctx.json(
|
||||
Result(
|
||||
|
|
|
@ -122,7 +122,8 @@ class SqliteRepository(url: String) {
|
|||
filename: String,
|
||||
contentType: String?,
|
||||
size: Long,
|
||||
content: InputStream
|
||||
contentExtracted: String,
|
||||
content: ByteArray
|
||||
): IdPair =
|
||||
database.databaseQueries.transactionWithResult {
|
||||
val extId = generateExtId()
|
||||
|
@ -132,7 +133,8 @@ class SqliteRepository(url: String) {
|
|||
filename = filename,
|
||||
content_type = contentType,
|
||||
file_size = size,
|
||||
content = content.readAllBytes()
|
||||
content_extracted = contentExtracted,
|
||||
content = content
|
||||
)
|
||||
IdPair(
|
||||
id = lastInsertedId(),
|
||||
|
@ -140,7 +142,6 @@ class SqliteRepository(url: String) {
|
|||
)
|
||||
}
|
||||
|
||||
|
||||
fun addNewFileToDocument(
|
||||
accountId: Long,
|
||||
documentId: Long,
|
||||
|
|
|
@ -70,7 +70,7 @@ insertFileForDocument:
|
|||
INSERT INTO file (account_id, document_id, ext_id, filename, content_type, file_size, content) VALUES (?, ?,?,?,?,?,?);
|
||||
|
||||
insertFileIntoLimbo:
|
||||
INSERT INTO file (account_id, ext_id, filename, content_type, file_size, content) VALUES (?,?,?,?,?,?);
|
||||
INSERT INTO file (account_id, ext_id, filename, content_type, file_size, content_extracted, content) VALUES (?,?,?,?,?,?, ?);
|
||||
|
||||
getLimboFileCount:
|
||||
SELECT count(*) AS count FROM file WHERE account_id=? AND document_id IS NULL AND state=?;
|
||||
|
@ -221,21 +221,23 @@ UPDATE file SET state=? WHERE account_id=? AND ext_id IN ?;
|
|||
|
||||
searchDocument:
|
||||
SELECT
|
||||
id,
|
||||
account_id,
|
||||
ext_id,
|
||||
title,
|
||||
description,
|
||||
tags,
|
||||
created,
|
||||
reference_date,
|
||||
state
|
||||
d.id,
|
||||
d.account_id,
|
||||
d.ext_id,
|
||||
d.title,
|
||||
d.description,
|
||||
d.tags,
|
||||
d.created,
|
||||
d.reference_date,
|
||||
d.state
|
||||
FROM
|
||||
document
|
||||
document d LEFT OUTER JOIN file f ON (f.document_id = d.id)
|
||||
WHERE
|
||||
account_id = :account_id AND
|
||||
state = :state AND
|
||||
(title LIKE :query OR
|
||||
description LIKE :query AND
|
||||
tags LIKE :query);
|
||||
d.account_id = :account_id AND
|
||||
d.state = :state AND
|
||||
(d.title LIKE :query OR
|
||||
d.description LIKE :query OR
|
||||
d.tags LIKE :query OR
|
||||
f.filename LIKE :query OR
|
||||
f.content_extracted LIKE :query);
|
||||
|
||||
|
|
Loading…
Reference in a new issue