Skip to content

Commit dfc9553

Browse files
committed
Initial commit
0 parents  commit dfc9553

11 files changed

+1044
-0
lines changed

.gitignore

+10
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Python-generated files
2+
__pycache__/
3+
*.py[oc]
4+
build/
5+
dist/
6+
wheels/
7+
*.egg-info
8+
9+
# Virtual environments
10+
.venv

.python-version

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
3.12

README-DEV.md

+71
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
# pasteurize development notes
2+
3+
Internal development notes and goals for the pasteurize library. This document is for developer reference only.
4+
5+
## Core functionality
6+
7+
The library takes a source directory and produces a markdown representation of all relevant code files, either copying to clipboard or saving to a file.
8+
9+
### Key features
10+
11+
- Scan directory recursively for code files
12+
- Smart filtering using .gitignore if available
13+
- Command-line interface built with typer/rich for great UX
14+
- Configurable file extension inclusion/exclusion
15+
- Output to clipboard (default) or file
16+
- XML-style markdown formatting
17+
18+
### Output format
19+
20+
Each file will be formatted in markdown like:
21+
22+
```xml
23+
<file path="path/to/file.py" language="python">
24+
def example():
25+
pass
26+
</file>
27+
```
28+
29+
## Technical decisions
30+
31+
### Dependencies
32+
33+
- typer - CLI interface
34+
- rich - Terminal formatting
35+
- pyperclip - Clipboard interaction
36+
- pathspec - .gitignore parsing
37+
- tomli - pyproject.toml parsing (if needed)
38+
39+
### Project structure
40+
41+
```
42+
pasteurize/
43+
├── pyproject.toml # Project metadata and dependencies
44+
├── src/
45+
│ └── pasteurize/
46+
│ ├── __init__.py
47+
│ ├── cli.py # Typer CLI implementation
48+
│ ├── core.py # Core scanning/processing logic
49+
│ ├── format.py # Markdown formatting utilities
│ └── patterns.py # Default extensions and exclusion patterns
50+
└── tests/
51+
└── test_*.py # Test files
52+
```
53+
54+
### Development roadmap
55+
56+
1. Basic file scanning with extension filtering
57+
2. Gitignore integration
58+
3. Markdown formatting
59+
4. Clipboard/file output handling
60+
5. CLI implementation
61+
6. Testing and documentation
62+
63+
### Notes
64+
65+
- Keep the core logic separate from CLI for better testing
66+
- Use pathlib for cross-platform path handling
67+
- Consider adding a config file option later for project-specific settings
68+
- May want to add file size limits and warnings
69+
- Consider adding syntax highlighting hints in markdown
70+
- Add support for detecting file language based on extension
71+
- Consider adding a way to exclude specific files/patterns beyond gitignore

README.md

Whitespace-only changes.

pyproject.toml

+29
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
[project]
2+
name = "pasteurize"
3+
version = "0.1.0"
4+
description = "Convert source code directories into markdown for LLM context"
5+
authors = []
6+
dependencies = [
7+
"typer>=0.9.0",
8+
"rich>=13.7.0",
9+
"pyperclip>=1.8.2",
10+
"pathspec>=0.12.1",
11+
]
12+
requires-python = ">=3.9"
13+
readme = "README.md"
14+
license = { text = "MIT" }
15+
16+
[project.scripts]
17+
pasteurize = "pasteurize.cli:app"
18+
19+
[build-system]
20+
requires = ["hatchling"]
21+
build-backend = "hatchling.build"
22+
23+
[tool.rye]
24+
managed = true
25+
dev-dependencies = ["pytest>=8.0.0", "black>=24.2.0", "ruff>=0.2.1"]
26+
27+
[tool.ruff]
28+
line-length = 88
29+
target-version = "py39"

src/pasteurize/__init__.py

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
"""Convert source code directories into markdown for LLM context."""
2+
3+
# Package version string; pyproject.toml declares the same value in
# [project].version, so update both together.
__version__ = "0.1.0"

src/pasteurize/cli.py

+120
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
import typer
2+
from pathlib import Path
3+
from typing import Optional, List
4+
from rich.console import Console
5+
import pyperclip
6+
7+
from .core import scan_directory
8+
from .format import format_files
9+
from .patterns import DEFAULT_EXTENSIONS
10+
11+
# Typer application object that the command below is registered on.
app = typer.Typer(
    add_completion=False,  # Disable shell completion for simplicity
    no_args_is_help=True,  # Show help when no args provided
)

# Rich console used for all user-facing output of the CLI.
console = Console()
16+
17+
18+
@app.command()
def main(
    source: list[Path] = typer.Argument(
        ...,
        help="Source directories to process",
        exists=True,
        file_okay=False,
        dir_okay=True,
    ),
    outfile: Optional[Path] = typer.Option(
        None,
        "--out",
        "-o",
        help="Output file path (defaults to clipboard)",
    ),
    include: Optional[str] = typer.Option(
        None,
        "--include",
        "-i",
        help="Extensions to include (comma-separated, e.g. 'py,js,ts'). "
        "If not specified, uses default extensions.",
    ),
    exclude: Optional[List[str]] = typer.Option(
        None,
        "--exclude",
        "-x",
        help="Glob patterns to exclude (can be used multiple times). Examples:\n"
        "--exclude '*.test.js' = exclude test files\n"
        "--exclude '**/tests/**' = exclude tests directories\n"
        "--exclude 'src/legacy/**' = exclude legacy directory\n"
        "--exclude '*.min.js' = exclude minified JS",
    ),
    verbose: bool = typer.Option(
        False,
        "--verbose",
        "-v",
        help="Show verbose output",
    ),
) -> None:
    """Convert source code files to markdown format for LLM context."""
    # NOTE: the docstring above doubles as the command's --help text, so the
    # implementation notes live in comments instead.
    #
    # Flow: parse --include, scan every source directory, de-duplicate the
    # results, format them, then write to --out or copy to the clipboard.
    # Exit status is 0 on success (including "nothing found") and 1 on error.
    try:
        if verbose:
            console.print(f"Scanning directories: {', '.join(str(s) for s in source)}")

            if include:
                console.print(f"Including extensions: {include}")
            else:
                console.print(
                    f"Using default extensions: {','.join(sorted(DEFAULT_EXTENSIONS))}"
                )

            if exclude:
                console.print(f"Exclude patterns: {', '.join(exclude)}")

        # Process include extensions - if specified, use only those.
        # Whitespace around entries ("py, js") and empty entries ("py,,js")
        # are dropped so they cannot turn into extensions that never match.
        if include:
            include_exts = [
                part.strip() for part in include.split(",") if part.strip()
            ]
            if not include_exts:
                console.print(
                    "[red]Error:[/] --include did not contain any extensions"
                )
                raise typer.Exit(1)
        else:
            include_exts = list(DEFAULT_EXTENSIONS)

        # Scan all directories and combine results
        all_files = []
        for directory in source:
            files = scan_directory(
                directory,
                include=include_exts,
                extra_patterns=exclude,
                verbose=verbose,
            )
            all_files.extend(files)

        # Remove any duplicates (in case of overlapping directories)
        all_files = sorted(set(all_files))

        if verbose:
            console.print(f"Found {len(all_files)} unique files")
            for f in all_files:
                # markup=False: paths such as "pages/[id].tsx" must be shown
                # literally rather than parsed as Rich style tags.
                console.print(f" {f}", markup=False)

        if not all_files:
            console.print("[yellow]No files found matching criteria[/]")
            raise typer.Exit(0)

        try:
            markdown = format_files(all_files, verbose=verbose)
        except Exception as e:
            console.print(f"[red]Error during formatting:[/] {e}")
            raise typer.Exit(1) from e

        try:
            if outfile:
                # Explicit UTF-8: source files routinely contain characters
                # that the platform's locale encoding cannot represent.
                outfile.write_text(markdown, encoding="utf-8")
                console.print(f"Output written to [green]{outfile}[/]")
            else:
                pyperclip.copy(markdown)
                console.print(
                    f"[green]Copied[/] {len(all_files)} files to clipboard "
                    f"({len(markdown)} characters)"
                )
        except Exception as e:
            console.print(f"[red]Error during output:[/] {e}")
            raise typer.Exit(1) from e

    except typer.Exit:
        # typer.Exit derives from RuntimeError, so without this clause the
        # generic handler below would swallow every deliberate exit, print a
        # bogus "Error: <code>" line and force the status to 1.
        raise
    except Exception as e:
        console.print(f"[red]Error:[/] {e}")
        raise typer.Exit(1) from e

src/pasteurize/core.py

+143
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
from pathlib import Path
2+
from typing import Optional, Sequence
3+
import pathspec
4+
import sys
5+
6+
from .patterns import DEFAULT_EXTENSIONS, EXCLUDED_DIRS, EXCLUDED_PATTERNS
7+
8+
9+
def find_gitignore(start_path: Path, *, verbose: bool = False) -> Optional[Path]:
    """Return the nearest ``.gitignore`` at or above ``start_path``.

    The search starts in ``start_path`` itself and walks up through every
    ancestor directory, including the filesystem root.

    Args:
        start_path: Directory to start searching from.
        verbose: When true, write progress messages to stderr.

    Returns:
        Path to the first ``.gitignore`` file found, or ``None`` if no
        directory on the way up contains one.
    """
    if verbose:
        print(f"Searching for gitignore from: {start_path}", file=sys.stderr)

    # resolve() (rather than absolute()) collapses ".." components. With an
    # unresolved path such as "x/../y", walking .parent would visit "x", which
    # is not an ancestor of the target, and could pick up x/.gitignore.
    origin = start_path.resolve()

    # origin.parents ends with the filesystem root, so the root is checked too.
    for directory in (origin, *origin.parents):
        candidate = directory / ".gitignore"
        if candidate.is_file():
            if verbose:
                print(f"Found gitignore at: {candidate}", file=sys.stderr)
            return candidate

    if verbose:
        print("No gitignore found", file=sys.stderr)
    return None
21+
22+
23+
def get_gitignore_spec(
    path: Path, extra_patterns: Optional[list[str]] = None, verbose: bool = False
) -> pathspec.PathSpec:
    """Build the exclusion spec used while scanning ``path``.

    Patterns are combined in this order: the built-in ``EXCLUDED_PATTERNS``,
    one ``"<dir>/"`` pattern per entry in ``EXCLUDED_DIRS``, the caller's
    ``extra_patterns``, and finally the non-blank, non-comment lines of the
    nearest ``.gitignore`` located by ``find_gitignore``.

    Args:
        path: Directory being scanned; the ``.gitignore`` search starts here.
        extra_patterns: Additional gitignore-style patterns to exclude.
        verbose: When true, write progress messages to stderr.

    Returns:
        A ``gitwildmatch`` ``PathSpec`` matching every path to exclude.
    """
    if verbose:
        print(f"Getting gitignore spec for: {path}", file=sys.stderr)

    patterns = list(EXCLUDED_PATTERNS)
    if verbose:
        print(f"Added {len(EXCLUDED_PATTERNS)} default patterns", file=sys.stderr)

    # Add directory exclusions
    dir_patterns = [f"{d}/" for d in EXCLUDED_DIRS]
    patterns.extend(dir_patterns)
    if verbose:
        print(f"Added {len(dir_patterns)} directory exclusions", file=sys.stderr)

    # Add any extra patterns provided
    if extra_patterns:
        patterns.extend(extra_patterns)
        if verbose:
            print(f"Added {len(extra_patterns)} extra patterns", file=sys.stderr)

    # Add patterns from .gitignore if found. The lookup must not depend on
    # `verbose`: that flag only controls diagnostics, and gating the lookup on
    # it made non-verbose runs ignore the project's .gitignore entirely.
    gitignore_path = find_gitignore(path)
    if gitignore_path:
        # NOTE(review): when the .gitignore lives in a parent of `path`, its
        # anchored patterns (e.g. "/build") are written relative to that
        # parent, while callers match paths relative to `path` - confirm
        # whether such patterns need rebasing.
        with open(gitignore_path, encoding="utf-8", errors="replace") as f:
            stripped_lines = [line.strip() for line in f]
        gitignore_patterns = [
            entry
            for entry in stripped_lines
            if entry and not entry.startswith("#")
        ]
        patterns.extend(gitignore_patterns)
        if verbose:
            print(
                f"Added {len(gitignore_patterns)} patterns from gitignore",
                file=sys.stderr,
            )

    if verbose:
        print(f"Total patterns: {len(patterns)}", file=sys.stderr)
    return pathspec.PathSpec.from_lines("gitwildmatch", patterns)
63+
64+
65+
def scan_directory(
    path: Path,
    include: Optional[Sequence[str]] = None,
    extra_patterns: Optional[list[str]] = None,
    verbose: bool = False,
) -> list[Path]:
    """
    Scan directory for relevant files.

    Walks ``path`` recursively and keeps every regular file that is not
    matched by the exclusion spec from ``get_gitignore_spec`` and whose
    suffix is one of the requested extensions.

    Args:
        path: Directory to scan
        include: File extensions to include, with or without a leading dot;
            compared case-insensitively. Defaults to ``DEFAULT_EXTENSIONS``
            when omitted or empty.
        extra_patterns: Additional gitignore-style patterns to exclude
        verbose: Whether to print debug information

    Returns:
        Sorted list of paths to relevant files

    Raises:
        ValueError: If ``path`` is not a directory.
    """
    if verbose:
        print(f"\nScanning directory: {path}", file=sys.stderr)

    if not path.is_dir():
        raise ValueError(f"Path {path} is not a directory")

    # Use provided extensions or defaults. File suffixes are lower-cased
    # before comparison below, so the requested extensions must be too;
    # otherwise include=["PY"] would silently match nothing. Surrounding
    # whitespace and empty entries are discarded for the same reason.
    include_set = set()
    for raw_ext in include or DEFAULT_EXTENSIONS:
        cleaned = raw_ext.strip().lstrip(".").lower()
        if cleaned:
            include_set.add(f".{cleaned}")

    if verbose:
        print(f"Include extensions: {include_set}", file=sys.stderr)

    # Get combined gitignore and default exclusions
    spec = get_gitignore_spec(path, extra_patterns, verbose)

    result = []
    processed = 0
    skipped = 0

    if verbose:
        print("\nStarting file scan...", file=sys.stderr)

    for file_path in path.rglob("*"):
        # `processed` counts every entry rglob yields, directories included.
        processed += 1
        if verbose and processed % 100 == 0:
            print(
                f"Processed {processed} files, found {len(result)}, skipped {skipped}...",
                file=sys.stderr,
            )

        # Skip non-files
        if not file_path.is_file():
            skipped += 1
            continue

        # Match on the path relative to the scan root, in POSIX form, which
        # is what gitignore-style patterns are written against. Every path
        # produced by path.rglob() is located under `path`, so relative_to()
        # cannot fail here.
        rel_path = file_path.relative_to(path)

        # Skip excluded patterns
        if spec.match_file(rel_path.as_posix()):
            skipped += 1
            continue

        # Apply extension filters
        if file_path.suffix.lower() not in include_set:
            skipped += 1
            continue

        result.append(file_path)

    if verbose:
        print(
            f"\nScan complete: processed {processed} files, found {len(result)}, skipped {skipped}",
            file=sys.stderr,
        )
    return sorted(result)

0 commit comments

Comments
 (0)