Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 63 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,64 @@
# IDE files
.idea
__pycache__
.vscode/
*.swp
*.swo
*~

# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# Virtual environments
venv/
.venv/
ENV/
env/
virtualenv/

# Testing
.pytest_cache/
.coverage
htmlcov/
coverage.xml
*.cover
.hypothesis/
.tox/
.nox/

# Claude
.claude/*

# Project specific
*/queue.txt
*/crawled.txt

# OS files
.DS_Store
Thumbs.db

# Logs
*.log

# Package manager
# Note: Do not ignore poetry.lock or uv.lock
pip-log.txt
pip-delete-this-directory.txt
282 changes: 282 additions & 0 deletions poetry.lock

Large diffs are not rendered by default.

89 changes: 89 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
[tool.poetry]
name = "website-crawler"
version = "0.1.0"
description = "A multi-threaded web crawler for gathering links"
authors = ["Your Name <your.email@example.com>"]
readme = "README.md"
packages = []

[tool.poetry.dependencies]
python = "^3.8"

[tool.poetry.group.dev.dependencies]
pytest = "^8.0.0"
pytest-cov = "^5.0.0"
pytest-mock = "^3.14.0"

[tool.poetry.scripts]
# Poetry script entries must be "module:callable" references; a bare module
# name cannot be resolved to something callable by `poetry run <script>`.
test = "pytest:main"
tests = "pytest:main"

[tool.pytest.ini_options]
minversion = "8.0"
testpaths = ["tests"]
python_files = ["test_*.py", "*_test.py"]
python_classes = ["Test*"]
python_functions = ["test_*"]
addopts = [
"--strict-markers",
"--strict-config",
"--verbose",
"--cov=.",
"--cov-branch",
"--cov-report=term-missing:skip-covered",
"--cov-report=html:htmlcov",
"--cov-report=xml:coverage.xml",
"--cov-fail-under=80",
]
markers = [
"unit: Unit tests",
"integration: Integration tests",
"slow: Slow tests",
]
filterwarnings = [
"error",
"ignore::UserWarning",
"ignore::DeprecationWarning",
]

[tool.coverage.run]
source = ["."]
omit = [
"*/tests/*",
"*/test_*",
"*/__pycache__/*",
"*/venv/*",
"*/.venv/*",
"*/virtualenv/*",
"*/setup.py",
"*/conftest.py",
"*/.pytest_cache/*",
"*/htmlcov/*",
]

[tool.coverage.report]
exclude_lines = [
"pragma: no cover",
"def __repr__",
"def __str__",
"raise AssertionError",
"raise NotImplementedError",
"if __name__ == .__main__.:",
"if TYPE_CHECKING:",
"class .*\\bProtocol\\):",
"@(abc\\.)?abstractmethod",
]
precision = 2
show_missing = true
skip_covered = false
fail_under = 80

[tool.coverage.html]
directory = "htmlcov"

[tool.coverage.xml]
output = "coverage.xml"

[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
Empty file added tests/__init__.py
Empty file.
176 changes: 176 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
"""Shared pytest fixtures and configuration for all tests."""

import os
import shutil
import tempfile
from pathlib import Path
from typing import Generator, Dict, Any
from unittest.mock import Mock, MagicMock

import pytest


@pytest.fixture
def temp_dir() -> Generator[Path, None, None]:
    """Provide a scratch directory that is removed once the test is done.

    Yields:
        Path: Location of a freshly created temporary directory.
    """
    scratch = Path(tempfile.mkdtemp())
    try:
        yield scratch
    finally:
        # The test may already have deleted the directory itself, so only
        # remove the tree when something is still on disk.
        still_present = scratch.exists()
        if still_present:
            shutil.rmtree(scratch)


@pytest.fixture
def mock_config() -> Dict[str, Any]:
    """Supply a ready-made crawler configuration mapping for tests.

    Returns:
        Dict[str, Any]: Project name, start page, domain, thread count and
        the names of the queue/crawled files.
    """
    return dict(
        project_name="test_project",
        home_page="https://example.com",
        domain_name="example.com",
        number_of_threads=4,
        queue_file="queue.txt",
        crawled_file="crawled.txt",
    )


@pytest.fixture
def mock_queue() -> Mock:
    """Build a stand-in queue object for exercising threading code.

    Returns:
        Mock: A mock whose ``empty()`` returns ``False``, whose ``get()``
        returns a fixed example URL, and whose ``put``/``task_done`` are
        plain mocks that record their calls.
    """
    fake_queue = Mock()
    # Dotted keys configure the return values of the child mocks.
    fake_queue.configure_mock(
        **{
            "empty.return_value": False,
            "get.return_value": "https://example.com/test",
        }
    )
    fake_queue.put = Mock()
    fake_queue.task_done = Mock()
    return fake_queue


@pytest.fixture
def sample_html() -> str:
    """Return a small HTML document used to exercise link extraction.

    Returns:
        str: Markup containing absolute, root-relative, bare-filename,
        ``mailto:``, fragment-only and other-domain anchors.
    """
    page_markup = """
<!DOCTYPE html>
<html>
<head>
<title>Test Page</title>
</head>
<body>
<a href="https://example.com/page1">External Link</a>
<a href="/page2">Relative Link</a>
<a href="page3.html">Simple Link</a>
<a href="mailto:test@example.com">Email Link</a>
<a href="#section">Anchor Link</a>
<a href="https://other-domain.com">Other Domain</a>
</body>
</html>
"""
    return page_markup


@pytest.fixture
def test_files(temp_dir: Path) -> Dict[str, Path]:
    """Create the crawler's seed files inside the temporary directory.

    Writes ``queue.txt`` (two example URLs) and ``crawled.txt`` (one example
    URL) into ``temp_dir``.

    Args:
        temp_dir: Directory in which the files are created.

    Returns:
        Dict[str, Path]: Maps ``"queue"`` and ``"crawled"`` to the paths of
        the files that were written.
    """
    seed_contents = {
        "queue": "https://example.com/page1\nhttps://example.com/page2\n",
        "crawled": "https://example.com\n",
    }

    files: Dict[str, Path] = {}
    for key, text in seed_contents.items():
        target = temp_dir / f"{key}.txt"
        # Explicit encoding keeps the fixture independent of the platform's
        # locale default.
        target.write_text(text, encoding="utf-8")
        files[key] = target
    return files


@pytest.fixture
def mock_spider_class() -> type:
    """Produce a lightweight substitute for the Spider class.

    Returns:
        type: A class whose instances store the constructor arguments and
        expose mock ``crawl_page``, ``gather_links``, ``add_links_to_queue``
        and ``update_files`` attributes.
    """

    class MockSpider:
        # Instances only remember their constructor arguments; every
        # behaviour is a Mock so tests can inspect the calls made.
        def __init__(self, thread_id, domain_name, boot_url):
            self.thread_id, self.domain_name, self.boot_url = (
                thread_id,
                domain_name,
                boot_url,
            )

            self.crawl_page = Mock()
            # gather_links always reports one newly discovered URL.
            self.gather_links = Mock(return_value={"https://example.com/new"})
            self.add_links_to_queue = Mock()
            self.update_files = Mock()

    return MockSpider


@pytest.fixture(autouse=True)
def reset_singletons():
    """Hook for restoring singleton state around every test.

    Declared ``autouse`` so it wraps each test without being requested.
    It currently performs no work on either side of the ``yield``.
    """
    # Setup: place singleton reset logic here when it becomes necessary.
    yield
    # Teardown: place post-test cleanup here when it becomes necessary.


@pytest.fixture
def mock_url_response():
    """Build a fake HTTP response for crawler tests.

    Returns:
        Mock: An object carrying ``status_code``, ``text``, ``headers`` and
        ``url`` attributes describing a successful HTML response.
    """
    # Keyword arguments to Mock become plain attributes on the instance.
    return Mock(
        status_code=200,
        text="<html><body><a href='/test'>Test Link</a></body></html>",
        headers={"Content-Type": "text/html"},
        url="https://example.com",
    )


@pytest.fixture
def capture_logs():
    """Collect log records that reach the root logger during a test.

    A temporary handler is attached to the root logger for the duration of
    the test and always detached afterwards. Logger levels are left
    untouched, so only records that pass the existing level filtering of
    the logger hierarchy end up in the list.

    Yields:
        list: The ``logging.LogRecord`` objects received so far, in order.
    """
    import logging

    class _ListHandler(logging.Handler):
        # Appends every record it is handed to the supplied list.
        def __init__(self, sink):
            super().__init__()
            self._sink = sink

        def emit(self, record):
            self._sink.append(record)

    log_records = []
    handler = _ListHandler(log_records)

    root_logger = logging.getLogger()
    root_logger.addHandler(handler)
    try:
        yield log_records
    finally:
        # Detach even if the generator is closed or raised into, so the
        # handler never leaks onto the root logger for later tests.
        root_logger.removeHandler(handler)
Empty file added tests/integration/__init__.py
Empty file.
Loading