// 221 lines · 7.3 KiB · Kotlin
package com.example.notesai.util
|
|
|
|
import android.content.Context
|
|
import android.net.Uri
|
|
import android.util.Log
|
|
import com.tom_roush.pdfbox.android.PDFBoxResourceLoader
|
|
import com.tom_roush.pdfbox.pdmodel.PDDocument
|
|
import com.tom_roush.pdfbox.text.PDFTextStripper
|
|
import java.io.BufferedReader
|
|
import java.io.InputStreamReader
|
|
|
|
/**
 * Turns a user-selected document (PDF, plain text or DOCX) into plain text.
 *
 * All parsing here reads the content synchronously through the content resolver and
 * nothing in this object switches threads, so [parseFile] should be invoked from a
 * background dispatcher.
 */
object FileParser {

    private const val TAG = "FileParser"

    private const val MIME_PDF = "application/pdf"
    private const val MIME_TXT = "text/plain"
    private const val MIME_DOCX =
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document"

    private const val TYPE_PDF = "PDF"
    private const val TYPE_TXT = "Text"
    private const val TYPE_DOCX = "Word"
    private const val TYPE_UNKNOWN = "Unknown"

    /** Path of the main document part inside a DOCX (ZIP) archive. */
    private const val DOCX_MAIN_PART = "word/document.xml"

    private val WHITESPACE = Regex("\\s+")

    /**
     * Matches either one text run (`<w:t ...>text</w:t>`, text captured in group 1) or the
     * end of a paragraph (`</w:p>`). The `(?:\s[^>]*)?` part keeps tags that merely start
     * with the same letters (`<w:tab/>`, `<w:tbl>`, ...) from being mistaken for `<w:t>`.
     */
    private val DOCX_TOKEN = Regex("""<w:t(?:\s[^>]*)?>([^<]+)</w:t>|</w:p>""")

    /**
     * Initializes PDFBox. Call this in `Application.onCreate` or before the first PDF is parsed.
     *
     * Failures are logged and swallowed.
     *
     * @param context context handed to `PDFBoxResourceLoader.init`
     */
    fun initPDFBox(context: Context) {
        try {
            PDFBoxResourceLoader.init(context)
            Log.d(TAG, "PDFBox initialized successfully")
        } catch (e: Exception) {
            Log.e(TAG, "Failed to initialize PDFBox", e)
        }
    }

    /**
     * Parses the file behind [uri] according to its type.
     *
     * The type is taken from the MIME type reported by the content resolver, falling back
     * to the extension of the display name.
     *
     * @param context used to reach the content resolver
     * @param uri location of the file to parse
     * @return [FileParseResult.Success] with the extracted text, or [FileParseResult.Error]
     *   when the format is unsupported, the extracted text is blank, or reading throws
     */
    suspend fun parseFile(context: Context, uri: Uri): FileParseResult {
        return try {
            val mimeType = context.contentResolver.getType(uri)
            val fileName = getFileName(context, uri)

            Log.d(TAG, "Parsing file: $fileName, type: $mimeType")

            // Resolve the type once so the parser that runs and the label that is
            // reported can never disagree (e.g. a PDF whose name has no extension).
            val fileType = getFileType(fileName, mimeType)
            val content = when (fileType) {
                TYPE_PDF -> parsePDF(context, uri)
                TYPE_TXT -> parseTXT(context, uri)
                TYPE_DOCX -> parseDOCX(context, uri)
                // Fall back to the file name so the message never reads "... : null".
                else -> return FileParseResult.Error(
                    "Format file tidak didukung: ${mimeType ?: fileName}"
                )
            }

            if (content.isBlank()) {
                FileParseResult.Error("File kosong atau tidak dapat dibaca")
            } else {
                FileParseResult.Success(
                    content = content,
                    fileName = fileName,
                    fileType = fileType,
                    // Ignore empty fragments so stray leading/trailing whitespace
                    // can never inflate the count.
                    wordCount = content.split(WHITESPACE).count { it.isNotEmpty() }
                )
            }
        } catch (e: Exception) {
            Log.e(TAG, "Error parsing file", e)
            FileParseResult.Error("Gagal membaca file: ${e.message}")
        }
    }

    /**
     * Extracts the text of a PDF file.
     *
     * @return the trimmed text produced by [PDFTextStripper]
     * @throws Exception if the stream cannot be opened or PDFBox fails
     */
    private fun parsePDF(context: Context, uri: Uri): String {
        val input = context.contentResolver.openInputStream(uri)
            ?: throw Exception("Cannot open file")

        return input.use { stream ->
            // `use` closes the document even when text extraction throws.
            PDDocument.load(stream).use { document ->
                PDFTextStripper().getText(document).trim()
            }
        }
    }

    /**
     * Reads a plain-text file as UTF-8.
     *
     * @return the trimmed file content, without a leading byte-order mark
     * @throws Exception if the stream cannot be opened
     */
    private fun parseTXT(context: Context, uri: Uri): String {
        val input = context.contentResolver.openInputStream(uri)
            ?: throw Exception("Cannot open file")

        return input.use { stream ->
            BufferedReader(InputStreamReader(stream, Charsets.UTF_8))
                .readText()
                // A UTF-8 BOM is not whitespace, so trim() alone would keep it.
                .removePrefix("\uFEFF")
                .trim()
        }
    }

    /**
     * Parses a DOCX file (simplified): a DOCX is a ZIP archive, so this only pulls the raw
     * text out of the `word/document.xml` entry.
     *
     * @return the extracted text, or an empty string when the archive has no main part
     * @throws Exception if the stream cannot be opened or the archive cannot be read
     */
    private fun parseDOCX(context: Context, uri: Uri): String {
        val input = context.contentResolver.openInputStream(uri)
            ?: throw Exception("Cannot open file")

        return try {
            // Closing the ZIP stream also closes the underlying input stream.
            java.util.zip.ZipInputStream(input).use { zip ->
                val mainPart = generateSequence { zip.nextEntry }
                    .firstOrNull { it.name == DOCX_MAIN_PART }
                // The stream is now positioned on the entry's data (if it was found).
                if (mainPart == null) "" else extractDocxText(zip.bufferedReader().readText())
            }
        } catch (e: Exception) {
            Log.e(TAG, "Error parsing DOCX", e)
            // Keep the original exception as the cause for diagnostics.
            throw Exception("Gagal membaca file DOCX: ${e.message}", e)
        }
    }

    /**
     * Collects the text of a WordprocessingML document in one pass: every text run is
     * emitted exactly once, followed by a space, and every paragraph end becomes a newline.
     */
    private fun extractDocxText(xml: String): String = buildString {
        DOCX_TOKEN.findAll(xml).forEach { token ->
            val run = token.groups[1]
            if (run != null) {
                append(decodeXmlEntities(run.value)).append(' ')
            } else {
                append('\n')
            }
        }
    }.trim()

    /** Replaces the five predefined XML entities with the characters they stand for. */
    private fun decodeXmlEntities(text: String): String {
        if ('&' !in text) return text
        return text
            .replace("&lt;", "<")
            .replace("&gt;", ">")
            .replace("&quot;", "\"")
            .replace("&apos;", "'")
            // Must be last so that "&amp;lt;" decodes to "&lt;" and not "<".
            .replace("&amp;", "&")
    }

    /**
     * Looks up the display name of [uri] through the content resolver.
     *
     * @return the display name, or `"unknown"` when the query yields no row, the column
     *   is missing, or its value is null
     */
    private fun getFileName(context: Context, uri: Uri): String {
        return context.contentResolver.query(uri, null, null, null, null)?.use { cursor ->
            val index = cursor.getColumnIndex(android.provider.OpenableColumns.DISPLAY_NAME)
            if (index != -1 && cursor.moveToFirst()) cursor.getString(index) else null
        } ?: "unknown"
    }

    /**
     * Maps a file to its display type name.
     *
     * @param fileName name whose extension is inspected
     * @param mimeType optional MIME type; when given it is accepted as an alternative
     *   to the extension
     * @return `"PDF"`, `"Text"`, `"Word"` or `"Unknown"`
     */
    private fun getFileType(fileName: String, mimeType: String? = null): String {
        return when {
            mimeType == MIME_PDF || fileName.endsWith(".pdf", ignoreCase = true) -> TYPE_PDF
            mimeType == MIME_TXT || fileName.endsWith(".txt", ignoreCase = true) -> TYPE_TXT
            mimeType == MIME_DOCX || fileName.endsWith(".docx", ignoreCase = true) -> TYPE_DOCX
            else -> TYPE_UNKNOWN
        }
    }

    /**
     * Looks up the size of [uri] through the content resolver.
     *
     * @return the size in bytes, or `0` when the query yields no row, the column is
     *   missing, or its value is null
     */
    fun getFileSize(context: Context, uri: Uri): Long {
        return context.contentResolver.query(uri, null, null, null, null)?.use { cursor ->
            val index = cursor.getColumnIndex(android.provider.OpenableColumns.SIZE)
            if (index != -1 && cursor.moveToFirst() && !cursor.isNull(index)) {
                cursor.getLong(index)
            } else {
                null
            }
        } ?: 0L
    }

    /**
     * Formats a byte count for display, using whole (truncated) units.
     *
     * @param bytes size in bytes
     * @return e.g. `"512 B"`, `"3 KB"` or `"7 MB"`
     */
    fun formatFileSize(bytes: Long): String {
        val kib = 1024L
        val mib = kib * kib
        return when {
            bytes < kib -> "$bytes B"
            bytes < mib -> "${bytes / kib} KB"
            else -> "${bytes / mib} MB"
        }
    }
}
|
|
|
|
/**
 * Outcome of a file-parsing attempt: either the extracted text together with its
 * metadata ([Success]) or a message describing why parsing failed ([Error]).
 */
sealed class FileParseResult {

    /**
     * Parsing succeeded.
     *
     * @property content extracted text
     * @property fileName name of the parsed file
     * @property fileType display label of the file format
     * @property wordCount number of words counted in [content]
     */
    data class Success(
        val content: String,
        val fileName: String,
        val fileType: String,
        val wordCount: Int
    ) : FileParseResult()

    /**
     * Parsing failed.
     *
     * @property message description of the failure
     */
    data class Error(val message: String) : FileParseResult()
}