Compare commits
No commits in common. "98fb2948fc75c40b87ad5c9e1c27662e3193fca3" and "77424c48426147de07768abfa123da68a8f74f55" have entirely different histories.
98fb2948fc
...
77424c4842
11 changed files with 21 additions and 258 deletions
|
@ -11,8 +11,6 @@ dependencies {
|
||||||
implementation("app.cash.sqldelight:sqlite-driver:2.0.2")
|
implementation("app.cash.sqldelight:sqlite-driver:2.0.2")
|
||||||
implementation("com.fasterxml.jackson.core:jackson-databind:2.18.2")
|
implementation("com.fasterxml.jackson.core:jackson-databind:2.18.2")
|
||||||
implementation("com.fasterxml.jackson.module:jackson-module-kotlin:2.18.+")
|
implementation("com.fasterxml.jackson.module:jackson-module-kotlin:2.18.+")
|
||||||
implementation("com.squareup.okhttp3:okhttp:4.12.0")
|
|
||||||
|
|
||||||
implementation(libs.slf4jsimple)
|
implementation(libs.slf4jsimple)
|
||||||
implementation(libs.javalin)
|
implementation(libs.javalin)
|
||||||
implementation(libs.commonsText)
|
implementation(libs.commonsText)
|
||||||
|
|
|
@ -1,46 +1,14 @@
|
||||||
package net.h34t.filemure
|
package net.h34t.filemure
|
||||||
|
|
||||||
import net.h34t.filemure.classification.AIClassifier
|
|
||||||
import org.apache.pdfbox.Loader
|
import org.apache.pdfbox.Loader
|
||||||
import org.apache.pdfbox.rendering.PDFRenderer
|
|
||||||
import org.apache.pdfbox.text.PDFTextStripper
|
import org.apache.pdfbox.text.PDFTextStripper
|
||||||
import java.io.ByteArrayOutputStream
|
|
||||||
import javax.imageio.ImageIO
|
|
||||||
|
|
||||||
class ContentExtractor(
|
class ContentExtractor {
|
||||||
val aiClassifier: AIClassifier
|
|
||||||
) {
|
|
||||||
|
|
||||||
/**
 * Produces a textual description of an image by delegating to [aiClassifier].
 *
 * The image is sent together with a fixed prompt asking for a summary of the
 * content and the kind of document shown. The answer is echoed to standard
 * output and then returned unchanged.
 *
 * @param contentType MIME type of [imageBytes], forwarded to the classifier as-is.
 * @param imageBytes raw bytes of the image to describe.
 * @return the text returned by the classifier.
 */
fun extractImage(contentType: String, imageBytes: ByteArray): String =
    aiClassifier
        .classifyImage(
            prompt = "Analyze the given image and concise summary of its contents, including the type of document depicted (invoice, certificate, doctor's note, ...).",
            contentType = contentType,
            data = imageBytes,
        )
        .also { summary ->
            // Echo the classifier's answer so it can be inspected on the console.
            println("AI image classification:\n\"\"\"\n$summary\n\"\"\"")
        }
|
|
||||||
|
|
||||||
fun extractPdf(pdfBytes: ByteArray): String {
|
fun extractPdf(pdfBytes: ByteArray): String {
|
||||||
val doc = Loader.loadPDF(pdfBytes)
|
val doc = Loader.loadPDF(pdfBytes)
|
||||||
val text = PDFTextStripper().getText(doc)
|
|
||||||
|
|
||||||
return if (text.isNotBlank()) {
|
return PDFTextStripper().getText(doc)
|
||||||
text
|
|
||||||
} else {
|
|
||||||
val renderer = PDFRenderer(doc)
|
|
||||||
val bi = renderer.renderImage(0)
|
|
||||||
val baos = ByteArrayOutputStream()
|
|
||||||
|
|
||||||
ImageIO.write(bi, "JPG", baos)
|
|
||||||
|
|
||||||
// for debugging
|
|
||||||
// ImageIO.write(bi, "JPG", File.createTempFile("pdfimage-", ".jpg", File(".")))
|
|
||||||
|
|
||||||
extractImage("image/jpeg", baos.toByteArray())
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fun extractPlain(bytes: ByteArray): String {
|
fun extractPlain(bytes: ByteArray): String {
|
||||||
|
|
|
@ -2,15 +2,12 @@ package net.h34t.filemure
|
||||||
|
|
||||||
import io.javalin.Javalin
|
import io.javalin.Javalin
|
||||||
import io.javalin.http.UnauthorizedResponse
|
import io.javalin.http.UnauthorizedResponse
|
||||||
import net.h34t.filemure.classification.AIClassifier
|
|
||||||
import net.h34t.filemure.controller.*
|
import net.h34t.filemure.controller.*
|
||||||
import net.h34t.filemure.repository.SqliteRepository
|
import net.h34t.filemure.repository.SqliteRepository
|
||||||
import net.h34t.filemure.tpl.Frame
|
import net.h34t.filemure.tpl.Frame
|
||||||
import net.h34t.filemure.tpl.Unauthorized
|
import net.h34t.filemure.tpl.Unauthorized
|
||||||
|
|
||||||
class FilemureApp(
|
class FilemureApp(repository: SqliteRepository) {
|
||||||
repository: SqliteRepository,
|
|
||||||
contentExtractor: ContentExtractor) {
|
|
||||||
|
|
||||||
private val modifiers = TemplateModifiers()
|
private val modifiers = TemplateModifiers()
|
||||||
|
|
||||||
|
@ -18,7 +15,7 @@ class FilemureApp(
|
||||||
|
|
||||||
private val overviewController = OverviewController(modifiers, repository)
|
private val overviewController = OverviewController(modifiers, repository)
|
||||||
private val limboController = LimboController(modifiers, repository)
|
private val limboController = LimboController(modifiers, repository)
|
||||||
private val uploadController = UploadController(modifiers, repository, contentExtractor)
|
private val uploadController = UploadController(modifiers, repository)
|
||||||
private val documentController = DocumentController(modifiers, repository)
|
private val documentController = DocumentController(modifiers, repository)
|
||||||
private val searchController = SearchController(modifiers, repository)
|
private val searchController = SearchController(modifiers, repository)
|
||||||
|
|
||||||
|
|
|
@ -2,34 +2,23 @@ package net.h34t.app.net.h34t.filemure
|
||||||
|
|
||||||
import io.javalin.Javalin
|
import io.javalin.Javalin
|
||||||
import io.javalin.http.staticfiles.Location
|
import io.javalin.http.staticfiles.Location
|
||||||
import net.h34t.filemure.ContentExtractor
|
|
||||||
import net.h34t.filemure.FilemureApp
|
import net.h34t.filemure.FilemureApp
|
||||||
import net.h34t.filemure.classification.AIClassifier
|
|
||||||
import net.h34t.filemure.repository.SqliteRepository
|
import net.h34t.filemure.repository.SqliteRepository
|
||||||
import org.eclipse.jetty.http.HttpCookie
|
import org.eclipse.jetty.http.HttpCookie
|
||||||
import org.eclipse.jetty.server.session.DefaultSessionCache
|
import org.eclipse.jetty.server.session.DefaultSessionCache
|
||||||
import org.eclipse.jetty.server.session.FileSessionDataStore
|
import org.eclipse.jetty.server.session.FileSessionDataStore
|
||||||
import org.eclipse.jetty.server.session.SessionHandler
|
import org.eclipse.jetty.server.session.SessionHandler
|
||||||
import java.io.File
|
import java.io.File
|
||||||
import java.time.Duration
|
|
||||||
import java.time.LocalDateTime
|
import java.time.LocalDateTime
|
||||||
import java.time.format.DateTimeFormatter
|
import java.time.format.DateTimeFormatter
|
||||||
|
|
||||||
fun main() {
|
fun main() {
|
||||||
val dtf = DateTimeFormatter.ISO_DATE_TIME
|
val dtf = DateTimeFormatter.ISO_DATE_TIME
|
||||||
|
|
||||||
val db = System.getenv("db_path")
|
val db = System.getenv("dbpath")
|
||||||
?: throw IllegalArgumentException("Please define an env dbpath, e.g. /data/filemure.db")
|
?: throw IllegalArgumentException("Please define an env dbpath, e.g. /data/filemure.db")
|
||||||
|
|
||||||
val sessionExpiry = System.getenv("session_expiry_sec")?.toInt() ?: (Duration.ofMinutes(30).toSeconds().toInt())
|
val app = FilemureApp(SqliteRepository("jdbc:sqlite:$db"))
|
||||||
|
|
||||||
val aiClassifier = AIClassifier(
|
|
||||||
key = System.getenv("ai_key")
|
|
||||||
)
|
|
||||||
|
|
||||||
val contentExtractor = ContentExtractor(aiClassifier)
|
|
||||||
|
|
||||||
val app = FilemureApp(SqliteRepository("jdbc:sqlite:$db"), contentExtractor)
|
|
||||||
|
|
||||||
Javalin
|
Javalin
|
||||||
.create { config ->
|
.create { config ->
|
||||||
|
@ -46,7 +35,7 @@ fun main() {
|
||||||
config.useVirtualThreads = true
|
config.useVirtualThreads = true
|
||||||
|
|
||||||
config.jetty.modifyServletContextHandler {
|
config.jetty.modifyServletContextHandler {
|
||||||
it.sessionHandler = fileSessionHandler(sessionExpiry)
|
it.sessionHandler = fileSessionHandler()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
.also {
|
.also {
|
||||||
|
@ -55,14 +44,14 @@ fun main() {
|
||||||
.start(7070)
|
.start(7070)
|
||||||
}
|
}
|
||||||
|
|
||||||
fun fileSessionHandler(expirySec: Int) = SessionHandler().apply {
|
fun fileSessionHandler() = SessionHandler().apply {
|
||||||
sessionCache = DefaultSessionCache(this).apply {
|
sessionCache = DefaultSessionCache(this).apply {
|
||||||
sessionDataStore = FileSessionDataStore().apply {
|
sessionDataStore = FileSessionDataStore().apply {
|
||||||
val baseDir = File(System.getProperty("java.io.tmpdir"))
|
val baseDir = File(System.getProperty("java.io.tmpdir"))
|
||||||
this.storeDir = File(baseDir, "javalin-session").apply { mkdirs() }
|
this.storeDir = File(baseDir, "javalin-session").apply { mkdirs() }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
maxInactiveInterval = expirySec
|
maxInactiveInterval = 30 * 60
|
||||||
httpOnly = true
|
httpOnly = true
|
||||||
isSecureRequestOnly = true
|
isSecureRequestOnly = true
|
||||||
sameSite = HttpCookie.SameSite.STRICT
|
sameSite = HttpCookie.SameSite.STRICT
|
||||||
|
|
|
@ -8,20 +8,12 @@ class TemplateModifiers : Frame.Modifiers, Limbo.Modifiers, DocumentCreateForm.M
|
||||||
FilePreview.Modifiers, DocumentEditForm.Modifiers, FileList.Modifiers,
|
FilePreview.Modifiers, DocumentEditForm.Modifiers, FileList.Modifiers,
|
||||||
net.h34t.filemure.tpl.Document.Modifiers, OverviewDocuments.Modifiers, Search.Modifiers, Tags.Modifiers {
|
net.h34t.filemure.tpl.Document.Modifiers, OverviewDocuments.Modifiers, Search.Modifiers, Tags.Modifiers {
|
||||||
|
|
||||||
private val linebreaks = Regex("\\v+")
|
|
||||||
|
|
||||||
fun hashPrefix(arg: String): String {
|
fun hashPrefix(arg: String): String {
|
||||||
return URLEncoder.encode(arg, Charsets.UTF_8)
|
return URLEncoder.encode(arg, Charsets.UTF_8)
|
||||||
}
|
}
|
||||||
|
|
||||||
override fun starPrefix(arg: String) = html(arg)
|
override fun starPrefix(arg: String): String {
|
||||||
|
|
||||||
override fun html(arg: String): String {
|
|
||||||
return StringEscapeUtils.escapeHtml4(arg)
|
return StringEscapeUtils.escapeHtml4(arg)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// override fun nl2br(arg: String): String {
|
|
||||||
// return arg.replace(linebreaks, "<br/>\n")
|
|
||||||
// }
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -45,14 +45,12 @@ value class Tag private constructor(val value: String) {
|
||||||
private val splitter = Regex("\\s+")
|
private val splitter = Regex("\\s+")
|
||||||
fun of(value: String): Tag =
|
fun of(value: String): Tag =
|
||||||
value.trim().let { v ->
|
value.trim().let { v ->
|
||||||
require(v.matches(validator)) {
|
require(v.matches(validator))
|
||||||
"\"$value\" isn't a valid tag"
|
|
||||||
}
|
|
||||||
Tag(v.lowercase())
|
Tag(v.lowercase())
|
||||||
}
|
}
|
||||||
|
|
||||||
fun parse(text: String?) =
|
fun parse(text: String?) =
|
||||||
text?.trim()?.split(splitter)?.map { of(it) } ?: emptyList()
|
text?.split(splitter)?.map { of(it) } ?: emptyList()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -52,16 +52,11 @@ private val tagSplitRegex = Regex("\\s+")
|
||||||
|
|
||||||
object TagAdapter {
|
object TagAdapter {
|
||||||
fun parse(ser: String?): List<Tag> {
|
fun parse(ser: String?): List<Tag> {
|
||||||
return ser?.trim()?.let {
|
return ser?.trim()?.let { if (it.isNotBlank()) it.split(tagSplitRegex).map { Tag.of(it) } else emptyList() }
|
||||||
if (it.isNotBlank()) it
|
|
||||||
.split(tagSplitRegex)
|
|
||||||
.filter { it.isNotBlank() }
|
|
||||||
.map { Tag.of(it) } else emptyList()
|
|
||||||
}
|
|
||||||
?: emptyList()
|
?: emptyList()
|
||||||
}
|
}
|
||||||
|
|
||||||
fun List<Tag>.serialize() = if (this.isEmpty()) "" else this.joinToString(" ") { it.value }
|
fun List<Tag>.serialize() = if (this.isEmpty()) "" else this.joinToString(",") { it.value }
|
||||||
}
|
}
|
||||||
|
|
||||||
fun List<Document>.grouped(): Map<Int, Map<Month, List<Document>>> =
|
fun List<Document>.grouped(): Map<Int, Map<Month, List<Document>>> =
|
||||||
|
|
|
@ -1,171 +0,0 @@
|
||||||
package net.h34t.filemure.classification
|
|
||||||
|
|
||||||
import com.fasterxml.jackson.annotation.JsonProperty
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper
|
|
||||||
import com.fasterxml.jackson.module.kotlin.readValue
|
|
||||||
import okhttp3.MediaType.Companion.toMediaType
|
|
||||||
import okhttp3.OkHttpClient
|
|
||||||
import okhttp3.Request
|
|
||||||
import okhttp3.RequestBody
|
|
||||||
import okhttp3.RequestBody.Companion.toRequestBody
|
|
||||||
import java.util.*
|
|
||||||
|
|
||||||
/**
 * Small client for an OpenRouter-style chat-completions endpoint.
 *
 * Each call sends exactly one message with role "user" (plain text, or text
 * plus an image embedded as a base64 data URL) and returns the text of the
 * answer.
 *
 * Examples for models:
 * Vision: "meta-llama/llama-3.2-11b-vision-instruct"
 * Text: "meta-llama/llama-3.2-3b-instruct"
 *
 * @property key API key, sent as `Authorization: Bearer <key>`.
 * @property visionModel model used by [classifyImage] unless overridden per call.
 * @property textModel model used by [queryText] unless overridden per call.
 * @property baseUrl URL the JSON request is POSTed to.
 */
class AIClassifier(
    val key: String,
    val visionModel: String = "meta-llama/llama-3.2-90b-vision-instruct",
    val textModel: String = "meta-llama/llama-3.3-70b-instruct",
    val baseUrl: String = "https://openrouter.ai/api/v1/chat/completions"
) {

    /** Request envelope: the model to use and the conversation messages. */
    data class OpenrouterRequest(
        val model: String,
        val messages: List<ReqMessage>
    )

    /** Marker for the parts a request message can consist of. */
    interface ContentType

    /** One request message: a role plus its content parts. */
    data class ReqMessage(
        val role: String,
        val content: List<ContentType>
    )

    /** Plain-text content part. */
    data class TextContent(
        val type: String = "text",
        val text: String
    ) : ContentType

    /** Image content part; the image is referenced through [ImageUrl]. */
    data class ImageContent(
        val type: String = "image_url",
        val image_url: ImageUrl
    ) : ContentType

    /** Wrapper for the URL of an image (here always a `data:` URL). */
    data class ImageUrl(
        val url: String
    )

    /** The subset of the response payload that this client maps. */
    data class OpenrouterResponse(
        @JsonProperty("id")
        val id: String,
        @JsonProperty("provider")
        val provider: String,
        @JsonProperty("model")
        val model: String,
        @JsonProperty("object")
        val contentObject: String,
        @JsonProperty("created")
        val created: Long,
        @JsonProperty("choices")
        val choices: List<Choice>,
        @JsonProperty("system_fingerprint")
        val systemFingerprint: String?,
        @JsonProperty("usage")
        val usage: Usage
    )

    /** One answer alternative of the response. */
    data class Choice(
        // NOTE(review): typed as Unit?, so presumably only `null` is expected here — confirm against the API.
        @JsonProperty("logprobs")
        val logprobs: Unit?,
        @JsonProperty("finish_reason")
        val finishReason: String?,
        @JsonProperty("native_finish_reason")
        val nativeFinishReason: String?,
        @JsonProperty("index")
        val index: Int,
        @JsonProperty("message")
        val message: ResMessage
    )

    /** The message carried by a [Choice]. */
    data class ResMessage(
        @JsonProperty("role")
        val role: String,
        @JsonProperty("content")
        val content: String,
        @JsonProperty("refusal")
        val refusal: String?
    )

    /** Token accounting reported with the response. */
    data class Usage(
        @JsonProperty("prompt_tokens")
        val prompt_tokens: Int,
        @JsonProperty("completion_tokens")
        val completion_tokens: Int,
        @JsonProperty("total_tokens")
        val total_tokens: Int
    )

    private val encoder = Base64.getEncoder()

    // One mapper for all calls. Unknown response fields are ignored so that a
    // field this client does not model cannot break deserialization.
    private val mapper: ObjectMapper = ObjectMapper()
        .configure(com.fasterxml.jackson.databind.DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)

    // One HTTP client for all calls instead of a new one per request; created on first use.
    private val client: OkHttpClient by lazy { OkHttpClient() }

    private val jsonMediaType = "application/json".toMediaType()

    /**
     * Sends [query] as a plain-text prompt.
     *
     * @param query the prompt text.
     * @param model model identifier; defaults to [textModel].
     * @return the answer text, see [classifyImage] for how multiple choices are combined.
     */
    fun queryText(
        query: String,
        model: String = textModel,
    ): String {
        return query(
            model = model,
            content = listOf(TextContent(text = query))
        )
    }

    /**
     * Posts a single user message with the given [content] parts and returns the
     * contents of all returned choices joined by newlines and trimmed.
     *
     * @throws java.io.IOException if the server answers with a non-2xx status.
     * @throws Exception if the response has no body.
     */
    private fun query(content: List<ContentType>, model: String): String {
        val orRequest = OpenrouterRequest(
            model = model,
            messages = listOf(
                ReqMessage(
                    role = "user",
                    content = content
                )
            )
        )

        val body: RequestBody = mapper.writeValueAsString(orRequest).toRequestBody(jsonMediaType)
        val req = Request.Builder()
            .url(baseUrl)
            .header("Authorization", "Bearer $key")
            .header("Content-Type", "application/json")
            .post(body)
            .build()

        return client.newCall(req).execute().use { res ->
            val bodyString = res.body?.string()?.trim()

            // Report HTTP failures as such instead of trying to map an error
            // payload onto OpenrouterResponse.
            if (!res.isSuccessful) {
                throw java.io.IOException(
                    "AI request failed with HTTP ${res.code}: ${bodyString.orEmpty().take(500)}"
                )
            }

            val responseBody = bodyString?.let { mapper.readValue<OpenrouterResponse>(it) }
                ?: throw Exception("body is null")

            responseBody.choices.joinToString("\n") { it.message.content }.trim()
        }
    }

    /**
     * Sends [prompt] together with an image and returns the model's answer.
     *
     * The image is embedded in the request as a `data:<contentType>;base64,...` URL.
     *
     * @param prompt instruction text sent alongside the image.
     * @param contentType MIME type used in the data URL.
     * @param data raw image bytes.
     * @param model model identifier; defaults to [visionModel].
     * @return the contents of all returned choices joined by newlines and trimmed.
     */
    fun classifyImage(
        prompt: String,
        contentType: String,
        data: ByteArray,
        model: String = visionModel
    ): String {
        val dataUrl = "data:$contentType;base64," + encoder.encodeToString(data)

        return query(
            model = model,
            content = listOf(
                TextContent(
                    type = "text",
                    // text = "Extract the message text only in this file and repeat it word-for-word."
                    text = prompt
                ),
                ImageContent(
                    type = "image_url",
                    image_url = ImageUrl(url = dataUrl)
                )
            )
        )
    }
}
|
|
|
@ -72,8 +72,7 @@ class DocumentController(val modifiers: TemplateModifiers, val repository: Sqlit
|
||||||
val referenceDate = referenceDates.firstOrNull()
|
val referenceDate = referenceDates.firstOrNull()
|
||||||
?: LocalDateTime.now()
|
?: LocalDateTime.now()
|
||||||
val tags = selectedFiles.map { File(it.filename).extension }.distinct().asSequence()
|
val tags = selectedFiles.map { File(it.filename).extension }.distinct().asSequence()
|
||||||
|
val description = ""
|
||||||
val description = selectedFiles.mapNotNull { it.contentExtracted }.joinToString("\n\n")
|
|
||||||
|
|
||||||
ctx.tempolin(
|
ctx.tempolin(
|
||||||
Frame(
|
Frame(
|
||||||
|
|
|
@ -6,11 +6,9 @@ import net.h34t.filemure.TemplateModifiers
|
||||||
import net.h34t.filemure.repository.SqliteRepository
|
import net.h34t.filemure.repository.SqliteRepository
|
||||||
import net.h34t.filemure.requireSession
|
import net.h34t.filemure.requireSession
|
||||||
|
|
||||||
class UploadController(
|
class UploadController(val modifiers: TemplateModifiers, val repository: SqliteRepository) {
|
||||||
private val modifiers: TemplateModifiers,
|
|
||||||
private val repository: SqliteRepository,
|
private val pdfContentExtractor = ContentExtractor()
|
||||||
private val contentExtractor: ContentExtractor
|
|
||||||
) {
|
|
||||||
|
|
||||||
fun upload(ctx: Context) {
|
fun upload(ctx: Context) {
|
||||||
val session = ctx.requireSession()
|
val session = ctx.requireSession()
|
||||||
|
@ -27,8 +25,8 @@ class UploadController(
|
||||||
val contentType = it.contentType()
|
val contentType = it.contentType()
|
||||||
|
|
||||||
val contentExtracted = when (contentType) {
|
val contentExtracted = when (contentType) {
|
||||||
"application/pdf" -> contentExtractor.extractPdf(content)
|
"application/pdf" -> pdfContentExtractor.extractPdf(content)
|
||||||
"text/plain" -> contentExtractor.extractPlain(content)
|
"text/plain" -> pdfContentExtractor.extractPlain(content)
|
||||||
else -> ""
|
else -> ""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -12,7 +12,7 @@
|
||||||
<div class="space"></div>
|
<div class="space"></div>
|
||||||
|
|
||||||
<b>Description</b>
|
<b>Description</b>
|
||||||
<pre>{$description|html}</pre>
|
<div>{*$description}</div>
|
||||||
|
|
||||||
</fieldset>
|
</fieldset>
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue