Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion core/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ group = 'com.opencontext'
version = '1.0.0'

java {
sourceCompatibility = '21'
sourceCompatibility = '17'
}

configurations {
Expand Down
304 changes: 300 additions & 4 deletions core/src/main/java/com/opencontext/service/ChunkingService.java
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ public class ChunkingService {

private static final int MAX_CHUNK_SIZE = 1000; // Maximum chunk size (number of characters)
private static final int CHUNK_OVERLAP = 200; // Overlap size between chunks
private static final int MIN_CODE_CHUNK_SIZE = 100; // Minimum chunk size for code to prevent tiny fragments
private static final int MAX_CODE_CHUNK_SIZE = 2000; // Larger max size for code to preserve semantic units

/**
* Converts parsed elements into structured chunks.
Expand Down Expand Up @@ -78,10 +80,19 @@ public List<StructuredChunk> createChunks(UUID documentId, List<Map<String, Obje
// Create content chunks within the current hierarchy structure
ChunkContext currentContext = hierarchyStack.isEmpty() ? null : hierarchyStack.peek();

// Split long text into multiple chunks
List<String> textChunks = splitLongText(text);
for (String textChunk : textChunks) {
chunks.add(createChunk(documentId, chunkIndex++, textChunk, currentContext, element));
// Check if this is code content
if (isCodeContent(text, element)) {
// Use code-aware chunking
List<String> codeChunks = splitCodeText(text, element);
for (String codeChunk : codeChunks) {
chunks.add(createChunk(documentId, chunkIndex++, codeChunk, currentContext, element));
}
} else {
// Split long text into multiple chunks
List<String> textChunks = splitLongText(text);
for (String textChunk : textChunks) {
chunks.add(createChunk(documentId, chunkIndex++, textChunk, currentContext, element));
}
}
}
default -> {
Expand Down Expand Up @@ -142,6 +153,291 @@ private String generateChunkId(UUID documentId, int chunkIndex) {
return documentId.toString() + "-chunk-" + chunkIndex;
}

/**
 * Determines whether an element's text should be chunked as source code.
 *
 * <p>Checks, in order: a {@code "filename"} metadata entry with a known code extension,
 * a {@code "language"} metadata entry naming a programming language, and finally
 * content heuristics applied to the text itself.
 *
 * <p>Metadata is read defensively: a missing element, a {@code "metadata"} value that is
 * not a map, or {@code "filename"}/{@code "language"} values that are not strings are
 * ignored instead of raising {@link ClassCastException} or {@link NullPointerException}.
 *
 * @param text    the element text; {@code null} is treated as "not code"
 * @param element the parsed element, optionally carrying a {@code "metadata"} map
 * @return {@code true} if any of the three checks identifies the content as code
 */
private boolean isCodeContent(String text, Map<String, Object> element) {
    Object rawMetadata = (element == null) ? null : element.get("metadata");

    if (rawMetadata instanceof Map<?, ?> metadata) {
        // Filename with a recognised code extension
        if (metadata.get("filename") instanceof String filename && isCodeFileExtension(filename)) {
            log.debug("πŸ“ [CHUNKING] Detected code content from filename: {}", filename);
            return true;
        }

        // Explicit language metadata
        if (metadata.get("language") instanceof String language && isCodeLanguage(language)) {
            log.debug("πŸ“ [CHUNKING] Detected code content from language metadata: {}", language);
            return true;
        }
    }

    // Fall back to inspecting the content itself
    if (text != null && hasCodePatterns(text)) {
        log.debug("πŸ“ [CHUNKING] Detected code content from content patterns");
        return true;
    }

    return false;
}

/**
 * Checks whether a filename ends in one of the recognised source-code extensions.
 *
 * <p>The comparison is case-insensitive. Lower-casing uses {@code Locale.ROOT} so the
 * result does not depend on the JVM default locale (under a Turkish locale,
 * {@code "INC".toLowerCase()} does not yield {@code "inc"}).
 *
 * @param filename the filename to inspect; must not be {@code null}
 * @return {@code true} if the extension is in the known code-extension list
 */
private boolean isCodeFileExtension(String filename) {
    String extension = getFileExtension(filename).toLowerCase(java.util.Locale.ROOT);
    return switch (extension) {
        case "java", "py", "js", "ts", "cpp", "c", "h", "hpp", "go", "rs",
             "rb", "php", "cs", "swift", "kt", "scala", "sh", "sql", "r",
             "m", "mm", "pl", "pm", "lua", "dart", "jl", "f90", "f95",
             "f03", "f08", "pas", "pp", "inc" -> true;
        default -> false;
    };
}

/**
 * Checks whether a language label names one of the recognised programming languages.
 *
 * <p>The comparison is case-insensitive. Lower-casing uses {@code Locale.ROOT} so that
 * labels containing the letter {@code I} (for example {@code "JAVASCRIPT"}) are still
 * recognised when the JVM default locale is Turkish or Azerbaijani.
 *
 * @param language the language label; must not be {@code null}
 * @return {@code true} if the label is in the known language list
 */
private boolean isCodeLanguage(String language) {
    String lang = language.toLowerCase(java.util.Locale.ROOT);
    return switch (lang) {
        case "java", "python", "javascript", "typescript", "c++", "c", "go",
             "rust", "ruby", "php", "csharp", "c#", "swift", "kotlin",
             "scala", "shell", "bash", "sql", "r", "objective-c", "perl",
             "lua", "dart", "julia", "fortran", "pascal" -> true;
        default -> false;
    };
}

/**
 * Regular expressions that hint at source code in several languages. Compiled once;
 * {@code MULTILINE} lets the {@code ;$} pattern match at the end of any line.
 */
private static final List<java.util.regex.Pattern> CODE_PATTERNS = java.util.stream.Stream.of(
        // Function/method declarations
        "public\\s+\\w+\\s+\\w+\\s*\\(",      // Java public methods
        "private\\s+\\w+\\s+\\w+\\s*\\(",     // Java private methods
        "def\\s+\\w+\\s*\\(",                 // Python functions
        "function\\s+\\w+\\s*\\(",            // JavaScript functions
        "\\w+\\s*\\(.*\\)\\s*\\{",            // General function with braces

        // Class declarations
        "class\\s+\\w+",                      // Class definitions
        "public\\s+class\\s+\\w+",            // Java class

        // Import/include statements
        "import\\s+[\\w\\.]+",                // Java/Python imports
        "#include\\s*<[^>]+>",                // C/C++ includes
        "using\\s+\\w+",                      // C# using

        // Control structures
        "if\\s*\\([^)]+\\)\\s*\\{",           // If statements with braces
        "for\\s*\\([^)]+\\)\\s*\\{",          // For loops with braces
        "while\\s*\\([^)]+\\)\\s*\\{",        // While loops with braces

        // Variable declarations with types
        "\\w+\\s+\\w+\\s*=",                  // Type variable = value
        "\\w+\\[\\]\\s+\\w+",                 // Array declarations

        // Common programming punctuation patterns
        "\\{[^}]*\\}",                        // Code blocks
        ";$",                                 // Statement endings
        "\\->|=>")                            // Arrow functions/lambdas
    .map(regex -> java.util.regex.Pattern.compile(regex, java.util.regex.Pattern.MULTILINE))
    .toList();

/**
 * Detects whether text looks like source code.
 *
 * <p>Returns {@code true} if any pattern in {@code CODE_PATTERNS} occurs anywhere in the
 * text, or if more than 5% of its characters are bracket, semicolon, equals or colon
 * characters.
 *
 * <p>Patterns are searched with {@code find()}. The previous
 * {@code text.matches(".*" + pattern + ".*")} form could not match text containing line
 * breaks, because {@code .} does not match line terminators, so multi-line code was only
 * ever detected by the character-density fallback.
 *
 * <p>Several patterns are deliberately broad (for example {@code class\s+\w+}), so prose
 * can also be classified as code.
 *
 * @param text the text to inspect; {@code null} or empty yields {@code false}
 * @return {@code true} if the text is judged to be code
 */
private boolean hasCodePatterns(String text) {
    if (text == null || text.isEmpty()) {
        return false; // also avoids a 0/0 density computation below
    }

    for (java.util.regex.Pattern pattern : CODE_PATTERNS) {
        if (pattern.matcher(text).find()) {
            return true;
        }
    }

    // Check for high density of programming characters
    long programmingChars = text.chars()
        .filter(c -> c == '{' || c == '}' || c == '(' || c == ')' ||
                     c == '[' || c == ']' || c == ';' || c == '=' || c == ':')
        .count();

    double programmingCharRatio = (double) programmingChars / text.length();
    return programmingCharRatio > 0.05; // 5% threshold for programming character density
}

/**
 * Returns the part of a filename after its last dot.
 *
 * <p>A name with no dot, or whose only dot is the first character (such as
 * {@code ".bashrc"}), has no extension and yields an empty string.
 *
 * @param filename the filename to inspect; must not be {@code null}
 * @return the extension without the dot, or {@code ""} if there is none
 */
private String getFileExtension(String filename) {
    final int dotIndex = filename.lastIndexOf('.');
    if (dotIndex <= 0) {
        return "";
    }
    return filename.substring(dotIndex + 1);
}

/**
 * Splits code text into chunks no larger than {@code MAX_CODE_CHUNK_SIZE} where possible.
 *
 * <p>Text already within the limit is returned as a single chunk. Otherwise it is first
 * split along code boundaries, and any piece that is still too large is split again
 * line by line.
 *
 * <p>Fragments shorter than {@code MIN_CODE_CHUNK_SIZE} that do not look like a complete
 * statement are merged into the preceding chunk when the result still fits. Previously
 * such fragments were filtered out, which silently removed source text from the output.
 * Blank fragments are still dropped.
 *
 * @param text    the code to split; must not be {@code null}
 * @param element the parsed element the text came from (currently unused)
 * @return the chunks in source order
 */
private List<String> splitCodeText(String text, Map<String, Object> element) {
    // If the code is already within acceptable size limits, return as is
    if (text.length() <= MAX_CODE_CHUNK_SIZE) {
        return List.of(text);
    }

    log.debug("πŸ“ [CHUNKING] Splitting large code text: {} characters", text.length());

    // Split by logical boundaries, then re-split any piece that is still too large
    List<String> sizedChunks = new ArrayList<>();
    for (String chunk : splitByCodeBoundaries(text)) {
        if (chunk.length() <= MAX_CODE_CHUNK_SIZE) {
            sizedChunks.add(chunk);
        } else {
            sizedChunks.addAll(splitLongCodeChunk(chunk));
        }
    }

    // Fold tiny fragments into their predecessor instead of discarding them
    List<String> finalChunks = new ArrayList<>();
    for (String chunk : sizedChunks) {
        if (chunk.isBlank()) {
            continue;
        }

        boolean tinyFragment = chunk.length() < MIN_CODE_CHUNK_SIZE && !isCompleteStatement(chunk);
        if (tinyFragment && !finalChunks.isEmpty()) {
            int lastIndex = finalChunks.size() - 1;
            String combined = finalChunks.get(lastIndex) + "\n" + chunk;
            if (combined.length() <= MAX_CODE_CHUNK_SIZE) {
                finalChunks.set(lastIndex, combined);
                continue;
            }
        }

        // No predecessor, or merging would exceed the limit: keep the fragment on its own
        finalChunks.add(chunk);
    }

    return finalChunks;
}

/**
 * Splits code into chunks along heuristic boundaries detected line by line.
 *
 * <p>The text is split on {@code "\n"} and lines are accumulated in a buffer. The buffer
 * is flushed (trimmed and appended to the result) when:
 * <ul>
 *   <li>a line looks like a method or class declaration, the buffer is non-empty and the
 *       running brace depth is at most 1 (the declaration line then starts a new buffer);</li>
 *   <li>the brace depth is 0 after a line, a declaration has been seen since the last
 *       such flush, and the buffer is longer than {@code MIN_CODE_CHUNK_SIZE};</li>
 *   <li>the buffer has grown past {@code MAX_CODE_CHUNK_SIZE}.</li>
 * </ul>
 *
 * <p>Brace depth is a plain count of '{' and '}' characters, so braces inside string
 * literals or comments are counted as well and the depth can become negative. The size
 * cap is checked only after a line has been appended, so a returned chunk can be longer
 * than {@code MAX_CODE_CHUNK_SIZE}. A flushed buffer holding only whitespace produces an
 * empty string in the result.
 *
 * @param text the code to split
 * @return the chunks in source order, or a single-element list containing {@code text}
 *         if no chunk was produced
 */
private List<String> splitByCodeBoundaries(String text) {
    List<String> chunks = new ArrayList<>();
    String[] lines = text.split("\n");
    StringBuilder currentChunk = new StringBuilder();
    int braceLevel = 0;
    boolean inFunction = false;
    boolean inClass = false;

    for (String line : lines) {
        String trimmedLine = line.trim();

        // Track brace levels for balanced splitting.
        // The depth is updated BEFORE the declaration checks below, so it already
        // includes any braces on the current line.
        for (char c : line.toCharArray()) {
            if (c == '{') braceLevel++;
            else if (c == '}') braceLevel--;
        }

        // Detect function/method starts
        if (isMethodDeclaration(trimmedLine)) {
            // If we have accumulated content and we're starting a new method, finalize current chunk
            if (currentChunk.length() > 0 && braceLevel <= 1) {
                chunks.add(currentChunk.toString().trim());
                currentChunk = new StringBuilder();
            }
            inFunction = true;
        }

        // Detect class declarations
        if (isClassDeclaration(trimmedLine)) {
            if (currentChunk.length() > 0 && braceLevel <= 1) {
                chunks.add(currentChunk.toString().trim());
                currentChunk = new StringBuilder();
            }
            inClass = true;
        }

        // The current line always goes into the (possibly fresh) buffer
        currentChunk.append(line).append("\n");

        // If we've closed all braces at a function/class level, consider ending the chunk
        if (braceLevel == 0 && (inFunction || inClass) && currentChunk.length() > MIN_CODE_CHUNK_SIZE) {
            chunks.add(currentChunk.toString().trim());
            currentChunk = new StringBuilder();
            inFunction = false;
            inClass = false;
        }

        // Prevent chunks from becoming too large.
        // This runs after the append, so the flushed chunk has already exceeded the limit.
        if (currentChunk.length() > MAX_CODE_CHUNK_SIZE) {
            chunks.add(currentChunk.toString().trim());
            currentChunk = new StringBuilder();
            inFunction = false;
            inClass = false;
        }
    }

    // Add any remaining content
    if (currentChunk.length() > 0) {
        chunks.add(currentChunk.toString().trim());
    }

    // split() yields no lines for text made up only of newlines; fall back to the input
    return chunks.isEmpty() ? List.of(text) : chunks;
}

/** A declaration keyword followed, later on the line, by a parenthesised list. */
private static final java.util.regex.Pattern KEYWORD_SIGNATURE = java.util.regex.Pattern.compile(
    "\\b(public|private|protected|static|def|function)\\b.*\\(.*\\)");

/** An identifier directly followed by a parenthesised list, e.g. {@code main(void)}. */
private static final java.util.regex.Pattern BARE_SIGNATURE = java.util.regex.Pattern.compile(
    "\\w+\\s*\\([^)]*\\)");

/** Lines that open with a control-flow keyword, optionally after {@code "}"} and/or {@code else}. */
private static final java.util.regex.Pattern CONTROL_FLOW_START = java.util.regex.Pattern.compile(
    "^(?:\\}\\s*)?(?:else\\s+)?(?:if|for|while|switch|catch|synchronized|return|throw|try|do|else)\\b");

/**
 * Heuristically checks whether a line is a method/function declaration.
 *
 * <p>A line qualifies if it contains a declaration keyword followed by a parenthesised
 * list, or an identifier immediately followed by a parenthesised list. Two kinds of line
 * that the same shapes would otherwise match are rejected, because they are statements
 * rather than declarations:
 * <ul>
 *   <li>lines starting with a control-flow keyword, such as {@code if (x) {} or
 *       {@code } else if (y) {};</li>
 *   <li>lines ending in {@code ;} that contain no opening brace, such as
 *       {@code doSomething(x);} or a field initialiser.</li>
 * </ul>
 *
 * <p>This remains a heuristic: calls in languages without statement terminators
 * (e.g. {@code print(x)}) are still reported as declarations.
 *
 * @param line a single source line; must not be {@code null}
 * @return {@code true} if the line looks like a method or function declaration
 */
private boolean isMethodDeclaration(String line) {
    String candidate = line.strip();
    if (candidate.isEmpty()) {
        return false;
    }

    // Control statements share the "name(...) {" shape but never start a method
    if (CONTROL_FLOW_START.matcher(candidate).find()) {
        return false;
    }

    // A terminated statement without a body is a call, prototype or field initialiser
    if (candidate.endsWith(";") && candidate.indexOf('{') < 0) {
        return false;
    }

    return KEYWORD_SIGNATURE.matcher(candidate).find()
        || BARE_SIGNATURE.matcher(candidate).find();
}

/**
 * Reports whether a line mentions a type declaration: the word {@code class},
 * {@code interface} or {@code enum}, followed by whitespace and an identifier.
 *
 * @param line a single source line; must not be {@code null}
 * @return {@code true} if the whole line matches the declaration pattern
 */
private boolean isClassDeclaration(String line) {
    final String declarationRegex = ".*\\b(class|interface|enum)\\s+\\w+.*";
    return java.util.regex.Pattern.matches(declarationRegex, line);
}

/**
 * Splits an oversized code chunk into pieces of at most {@code MAX_CODE_CHUNK_SIZE}
 * characters, breaking between lines wherever possible.
 *
 * <p>Lines are accumulated until adding the next one would exceed the limit. A single
 * line that is itself longer than the limit (for example minified code) is cut into
 * limit-sized slices; previously such a line was emitted whole and exceeded the limit.
 * A cut is never made between the two halves of a surrogate pair.
 *
 * <p>Every piece is trimmed, and pieces that are blank after trimming are omitted.
 *
 * @param text the code to split; must not be {@code null}
 * @return the pieces in source order; empty if {@code text} contains only whitespace
 */
private List<String> splitLongCodeChunk(String text) {
    List<String> chunks = new ArrayList<>();
    StringBuilder currentChunk = new StringBuilder();

    for (String line : text.split("\n")) {
        // If adding this line would exceed the limit, finalize the current chunk first
        if (currentChunk.length() + line.length() + 1 > MAX_CODE_CHUNK_SIZE) {
            flushCodeChunk(currentChunk, chunks);
        }

        // The buffer is empty here whenever the line alone is over the limit,
        // so slices can be emitted directly without disturbing the order.
        String remaining = line;
        while (remaining.length() > MAX_CODE_CHUNK_SIZE) {
            int cut = MAX_CODE_CHUNK_SIZE;
            if (Character.isHighSurrogate(remaining.charAt(cut - 1))) {
                cut--; // keep the surrogate pair together in the next slice
            }
            String slice = remaining.substring(0, cut).trim();
            if (!slice.isEmpty()) {
                chunks.add(slice);
            }
            remaining = remaining.substring(cut);
        }

        currentChunk.append(remaining).append("\n");
    }

    flushCodeChunk(currentChunk, chunks);
    return chunks;
}

/** Appends the trimmed buffer content to {@code out} unless it is blank, then clears the buffer. */
private static void flushCodeChunk(StringBuilder buffer, List<String> out) {
    String content = buffer.toString().trim();
    if (!content.isEmpty()) {
        out.add(content);
    }
    buffer.setLength(0);
}

/**
 * Heuristically decides whether a small code fragment is a complete statement.
 *
 * <p>After trimming, the fragment counts as complete if any of the following holds:
 * it ends with {@code ;}, {@code }} or {@code :}; it begins with an import-style
 * directive ({@code import}, {@code #include}, {@code using}, {@code from}); or it is
 * shorter than 200 characters and contains {@code =} but no opening brace.
 *
 * @param chunk the fragment to inspect; must not be {@code null}
 * @return {@code true} if the fragment looks complete
 */
private boolean isCompleteStatement(String chunk) {
    final String body = chunk.trim();

    // Typical terminators of a finished statement or block
    final boolean terminated = List.of(";", "}", ":").stream().anyMatch(body::endsWith);

    // Import-style directives stand on their own
    final boolean directive =
        List.of("import ", "#include", "using ", "from ").stream().anyMatch(body::startsWith);

    // Short single-line assignment with no block opener
    final boolean shortAssignment =
        body.length() < 200 && body.indexOf('=') >= 0 && body.indexOf('{') < 0;

    return terminated || directive || shortAssignment;
}

/**
* ν—€λ”μ˜ λ ˆλ²¨μ„ κ²°μ •ν•©λ‹ˆλ‹€.
*/
Expand Down
Loading