Adds PDF content extraction.
This commit is contained in:
parent
3897fdc688
commit
2c2db1a42e
4 changed files with 64 additions and 29 deletions
17
app/src/main/kotlin/net/h34t/filemure/ContentExtractor.kt
Normal file
17
app/src/main/kotlin/net/h34t/filemure/ContentExtractor.kt
Normal file
|
@ -0,0 +1,17 @@
|
||||||
|
package net.h34t.filemure
|
||||||
|
|
||||||
|
import org.apache.pdfbox.Loader
|
||||||
|
import org.apache.pdfbox.text.PDFTextStripper
|
||||||
|
|
||||||
|
/**
 * Extracts plain text from the raw bytes of uploaded files.
 */
class ContentExtractor {

    /**
     * Returns the text of the PDF document contained in [pdfBytes].
     *
     * The loaded document is closed before this function returns, whether
     * text extraction succeeds or throws, so no PDF resources are leaked.
     * Exceptions raised by PDFBox while loading or stripping are not caught
     * here and propagate to the caller.
     *
     * @param pdfBytes the complete PDF file content.
     * @return the text produced by [PDFTextStripper] for the whole document.
     */
    fun extractPdf(pdfBytes: ByteArray): String =
        // `use` closes the PDDocument even when getText throws.
        Loader.loadPDF(pdfBytes).use { document ->
            PDFTextStripper().getText(document)
        }

    /**
     * Decodes [bytes] as UTF-8 text.
     *
     * @param bytes the raw file content.
     * @return the decoded string.
     */
    fun extractPlain(bytes: ByteArray): String = bytes.toString(Charsets.UTF_8)
}
|
|
@ -1,12 +1,15 @@
|
||||||
package net.h34t.filemure.controller
|
package net.h34t.filemure.controller
|
||||||
|
|
||||||
import io.javalin.http.Context
|
import io.javalin.http.Context
|
||||||
|
import net.h34t.filemure.ContentExtractor
|
||||||
import net.h34t.filemure.TemplateModifiers
|
import net.h34t.filemure.TemplateModifiers
|
||||||
import net.h34t.filemure.repository.SqliteRepository
|
import net.h34t.filemure.repository.SqliteRepository
|
||||||
import net.h34t.filemure.requireSession
|
import net.h34t.filemure.requireSession
|
||||||
|
|
||||||
class UploadController(val modifiers: TemplateModifiers, val repository: SqliteRepository) {
|
class UploadController(val modifiers: TemplateModifiers, val repository: SqliteRepository) {
|
||||||
|
|
||||||
|
private val pdfContentExtractor = ContentExtractor()
|
||||||
|
|
||||||
fun upload(ctx: Context) {
|
fun upload(ctx: Context) {
|
||||||
val session = ctx.requireSession()
|
val session = ctx.requireSession()
|
||||||
|
|
||||||
|
@ -16,13 +19,30 @@ class UploadController(val modifiers: TemplateModifiers, val repository: SqliteR
|
||||||
|
|
||||||
val files = ctx.uploadedFiles()
|
val files = ctx.uploadedFiles()
|
||||||
|
|
||||||
|
val extIds = files.map {
|
||||||
|
it.contentAndClose { contentStream ->
|
||||||
|
val content = contentStream.readAllBytes()
|
||||||
|
val contentType = it.contentType()
|
||||||
|
|
||||||
|
val contentExtracted = when (contentType) {
|
||||||
|
"application/pdf" -> pdfContentExtractor.extractPdf(content)
|
||||||
|
"text/plain" -> pdfContentExtractor.extractPlain(content)
|
||||||
|
else -> ""
|
||||||
|
}
|
||||||
|
|
||||||
|
repository.addFileToLimbo(
|
||||||
|
accountid,
|
||||||
|
it.filename(),
|
||||||
|
contentType,
|
||||||
|
it.size(),
|
||||||
|
contentExtracted,
|
||||||
|
content
|
||||||
|
).extId
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
when (target) {
|
when (target) {
|
||||||
"document" -> {
|
"document" -> {
|
||||||
val extIds = files.map {
|
|
||||||
it.contentAndClose { content ->
|
|
||||||
repository.addFileToLimbo(accountid, it.filename(), it.contentType(), it.size(), content).extId
|
|
||||||
}
|
|
||||||
}
|
|
||||||
ctx.status(200)
|
ctx.status(200)
|
||||||
ctx.json(
|
ctx.json(
|
||||||
Result(
|
Result(
|
||||||
|
@ -33,11 +53,6 @@ class UploadController(val modifiers: TemplateModifiers, val repository: SqliteR
|
||||||
}
|
}
|
||||||
|
|
||||||
"limbo" -> {
|
"limbo" -> {
|
||||||
files.forEach {
|
|
||||||
it.contentAndClose { content ->
|
|
||||||
repository.addFileToLimbo(accountid, it.filename(), it.contentType(), it.size(), content)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
ctx.status(200)
|
ctx.status(200)
|
||||||
ctx.json(
|
ctx.json(
|
||||||
Result(
|
Result(
|
||||||
|
|
|
@ -122,7 +122,8 @@ class SqliteRepository(url: String) {
|
||||||
filename: String,
|
filename: String,
|
||||||
contentType: String?,
|
contentType: String?,
|
||||||
size: Long,
|
size: Long,
|
||||||
content: InputStream
|
contentExtracted: String,
|
||||||
|
content: ByteArray
|
||||||
): IdPair =
|
): IdPair =
|
||||||
database.databaseQueries.transactionWithResult {
|
database.databaseQueries.transactionWithResult {
|
||||||
val extId = generateExtId()
|
val extId = generateExtId()
|
||||||
|
@ -132,7 +133,8 @@ class SqliteRepository(url: String) {
|
||||||
filename = filename,
|
filename = filename,
|
||||||
content_type = contentType,
|
content_type = contentType,
|
||||||
file_size = size,
|
file_size = size,
|
||||||
content = content.readAllBytes()
|
content_extracted = contentExtracted,
|
||||||
|
content = content
|
||||||
)
|
)
|
||||||
IdPair(
|
IdPair(
|
||||||
id = lastInsertedId(),
|
id = lastInsertedId(),
|
||||||
|
@ -140,7 +142,6 @@ class SqliteRepository(url: String) {
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
fun addNewFileToDocument(
|
fun addNewFileToDocument(
|
||||||
accountId: Long,
|
accountId: Long,
|
||||||
documentId: Long,
|
documentId: Long,
|
||||||
|
|
|
@ -70,7 +70,7 @@ insertFileForDocument:
|
||||||
INSERT INTO file (account_id, document_id, ext_id, filename, content_type, file_size, content) VALUES (?, ?,?,?,?,?,?);
|
INSERT INTO file (account_id, document_id, ext_id, filename, content_type, file_size, content) VALUES (?, ?,?,?,?,?,?);
|
||||||
|
|
||||||
insertFileIntoLimbo:
|
insertFileIntoLimbo:
|
||||||
INSERT INTO file (account_id, ext_id, filename, content_type, file_size, content) VALUES (?,?,?,?,?,?);
|
INSERT INTO file (account_id, ext_id, filename, content_type, file_size, content_extracted, content) VALUES (?,?,?,?,?,?, ?);
|
||||||
|
|
||||||
getLimboFileCount:
|
getLimboFileCount:
|
||||||
SELECT count(*) AS count FROM file WHERE account_id=? AND document_id IS NULL AND state=?;
|
SELECT count(*) AS count FROM file WHERE account_id=? AND document_id IS NULL AND state=?;
|
||||||
|
@ -221,21 +221,23 @@ UPDATE file SET state=? WHERE account_id=? AND ext_id IN ?;
|
||||||
|
|
||||||
searchDocument:
|
searchDocument:
|
||||||
SELECT
|
SELECT
|
||||||
id,
|
d.id,
|
||||||
account_id,
|
d.account_id,
|
||||||
ext_id,
|
d.ext_id,
|
||||||
title,
|
d.title,
|
||||||
description,
|
d.description,
|
||||||
tags,
|
d.tags,
|
||||||
created,
|
d.created,
|
||||||
reference_date,
|
d.reference_date,
|
||||||
state
|
d.state
|
||||||
FROM
|
FROM
|
||||||
document
|
document d LEFT OUTER JOIN file f ON (f.document_id = d.id)
|
||||||
WHERE
|
WHERE
|
||||||
account_id = :account_id AND
|
d.account_id = :account_id AND
|
||||||
state = :state AND
|
d.state = :state AND
|
||||||
(title LIKE :query OR
|
(d.title LIKE :query OR
|
||||||
description LIKE :query AND
|
d.description LIKE :query OR
|
||||||
tags LIKE :query);
|
d.tags LIKE :query OR
|
||||||
|
f.filename LIKE :query OR
|
||||||
|
f.content_extracted LIKE :query);
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue