diff --git a/.DS_Store b/.DS_Store
index 58839f9..4324c2f 100644
Binary files a/.DS_Store and b/.DS_Store differ
diff --git a/Dockerfile b/Dockerfile
index 36919c0..3c5e4a7 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -19,4 +19,5 @@ EXPOSE 8080
 
 # Run the API Gateway.
 # Cloud Run will set PORT, so we use that environment variable.
-CMD ["python", "backend/api_gateway/api_gateway.py"]
\ No newline at end of file
+# CMD ["python", "backend/api_gateway/api_gateway.py"]
+CMD ["./start-services.sh"]
\ No newline at end of file
diff --git a/backend/api_gateway/api_gateway.py b/backend/api_gateway/api_gateway.py
index 60a678c..a2bbfb6 100644
--- a/backend/api_gateway/api_gateway.py
+++ b/backend/api_gateway/api_gateway.py
@@ -39,12 +39,9 @@
 # Add project root to Python path for relative imports
 sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
 
-print("[DEBUG] [api_gateway] [startup] API Gateway starting up...")
-
 # Load environment variables from .env file
 from dotenv import load_dotenv
 load_dotenv()
-print("[DEBUG] [api_gateway] [startup] Environment variables loaded")
 
 # Import microservices and utilities
 from backend.microservices.summarization_service import run_summarization, process_articles
@@ -54,805 +51,70 @@
 from backend.microservices.auth_service import load_users
 from backend.microservices.news_storage import store_article_in_supabase, log_user_search, add_bookmark, get_user_bookmarks, delete_bookmark
 from backend.microservices.story_tracking_service import get_tracked_stories, create_tracked_story, get_story_details, delete_tracked_story
-
+from backend.api_gateway.utils.auth import token_required
 
 # Initialize logger for the API Gateway
 logger = setup_logger(__name__)
-print("[DEBUG] [api_gateway] [startup] Logger initialized")
+logger.info("API Gateway starting up...")
 
 # Initialize Flask application with security configurations
 app = Flask(__name__)
 app.config['SECRET_KEY'] = os.getenv('JWT_SECRET_KEY', 'your-secret-key')  # JWT secret key for token signing
-print("[DEBUG] [api_gateway] [startup] Flask app initialized with secret key")
+logger.info("Flask app initialized with security configurations")
 
 # Configure CORS to allow specific origins and methods
+allowed_origins = ["http://localhost:5173", "http://localhost:8080"]
 CORS(app,
-     origins=["http://localhost:5173", "http://localhost:5001"],
+     origins=allowed_origins,
      supports_credentials=True,
      allow_headers=["Content-Type", "Authorization"],
      methods=["GET", "POST", "PUT", "DELETE", "OPTIONS"])
-print("[DEBUG] [api_gateway] [startup] CORS configured")
+logger.info(f"CORS configured with allowed origins: {allowed_origins}")
 
 # Initialize Flask-RestX for API documentation
 api = Api(app, version='1.0', title='News Aggregator API',
           description='A news aggregation and summarization API')
-print("[DEBUG] [api_gateway] [startup] Flask-RestX API initialized")
-
-# Define API namespaces for logical grouping of endpoints
-news_ns = api.namespace('api/news', description='News operations')
-health_ns = api.namespace('health', description='Health check operations')
-summarize_ns = api.namespace('summarize', description='Text summarization operations')
-user_ns = api.namespace('api/user', description='User operations')
-auth_ns = api.namespace('api/auth', description='Authentication operations')
-bookmark_ns = api.namespace('api/bookmarks', description='Bookmark operations')
-story_tracking_ns = api.namespace('api/story_tracking', description='Story tracking operations')
-print("[DEBUG] [api_gateway] [startup] API namespaces defined")
-
-def token_required(f):
-    """Decorator to protect routes that require authentication.
-
-    This decorator validates the JWT token in the Authorization header.
-    It ensures that only authenticated users can access protected endpoints.
+logger.info("Flask-RestX API initialized with documentation support")
+
+# Import namespaces from route modules
+try:
+    from backend.api_gateway.routes.news import news_ns
+    from backend.api_gateway.routes.auth import auth_ns
+    from backend.api_gateway.routes.health import health_ns
+    from backend.api_gateway.routes.summarize import summarize_ns
+    from backend.api_gateway.routes.user import user_ns
+    from backend.api_gateway.routes.bookmark import bookmark_ns
+    from backend.api_gateway.routes.story_tracking import story_tracking_ns
-    Args:
-        f: The function to be decorated.
-
-    Returns:
-        decorated: The decorated function that includes token validation.
-
-    Raises:
-        401: If the token is missing or invalid.
-    """
-    @wraps(f)
-    def decorated(*args, **kwargs):
-        print("[DEBUG] [api_gateway] [token_required] Checking token in request")
-        auth_header = request.headers.get('Authorization')
-        if not auth_header:
-            print("[DEBUG] [api_gateway] [token_required] Authorization header missing")
-            return {'error': 'Authorization header missing'}, 401
-        try:
-            token = auth_header.split()[1]  # Extract token from 'Bearer <token>'
-            print(f"[DEBUG] [api_gateway] [token_required] Decoding token: {token[:10]}...")
-            payload = jwt.decode(token, app.config['SECRET_KEY'], algorithms=['HS256'], audience='authenticated')
-            print(f"[DEBUG] [api_gateway] [token_required] Token decoded successfully, user: {payload.get('sub', 'unknown')}")
-
-            return f(*args, **kwargs)
-        except Exception as e:
-            print(f"[DEBUG] [api_gateway] [token_required] Token validation error: {str(e)}")
-            return {'error': 'Invalid token', 'message': str(e)}, 401
-    return decorated
+
+    # Register imported namespaces with the API
+    api.add_namespace(news_ns)
+    api.add_namespace(auth_ns)
+    api.add_namespace(health_ns)
+    api.add_namespace(summarize_ns)
+    api.add_namespace(user_ns)
+    api.add_namespace(bookmark_ns)
+    api.add_namespace(story_tracking_ns)
+    logger.info("All API namespaces successfully registered")
+except Exception as e:
+    logger.error(f"Error loading API namespaces: {str(e)}")
+    raise
+
+# token_required decorator is now in utils/auth.py
 
 # Define API models for request/response documentation
-article_model = api.model('Article', {
-    'article_text': fields.String(required=True, description='The text to summarize')
-})
-
-user_profile_model = api.model('UserProfile', {
-    'id': fields.String(description='User ID'),
-    'username': fields.String(description='Username'),
-    'email': fields.String(description='Email address'),
-    'firstName': fields.String(description='First name'),
-    'lastName': fields.String(description='Last name'),
-    'avatarUrl': fields.String(description='URL to user avatar')
-})
-
-# Model for user registration
-signup_model = api.model('Signup', {
-    'username': fields.String(required=True, description='Username'),
-    'password': fields.String(required=True, description='Password'),
-    'email': fields.String(required=True, description='Email address'),
-    'firstName': fields.String(required=False, description='First name'),
-    'lastName': fields.String(required=False, description='Last name')
-})
-
-print("[DEBUG] [api_gateway] [startup] API models defined")
-
-# Health check endpoint for system monitoring
-@health_ns.route('/')
-class HealthCheck(Resource):
-    def get(self):
-        """Check the health status of the API Gateway.
-
-        Returns:
-            dict: A dictionary containing the health status.
-            int: HTTP 200 status code indicating success.
-        """
-        print("[DEBUG] [api_gateway] [health_check] Called")
-        return {"status": "API Gateway is healthy"}, 200
-
-# Endpoint for article summarization
-@summarize_ns.route('/')
-class Summarize(Resource):
-    @summarize_ns.expect(article_model)
-    def post(self):
-        """Summarize the provided article text.
-
-        Expects a JSON payload with 'article_text' field.
-        Uses the summarization service to generate a concise summary.
-
-        Returns:
-            dict: Contains the generated summary.
-            int: HTTP 200 status code on success.
-        """
-        print("[DEBUG] [api_gateway] [summarize] Called")
-        data = request.get_json()
-        article_text = data.get('article_text', '')
-        print(f"[DEBUG] [api_gateway] [summarize] Summarizing text of length: {len(article_text)}")
-        summary = run_summarization(article_text)
-        print(f"[DEBUG] [api_gateway] [summarize] Summarization complete, summary length: {len(summary)}")
-        return {"summary": summary}, 200
-
-@news_ns.route('/fetch')
-class NewsFetch(Resource):
-    @news_ns.param('keyword', 'Search keyword for news')
-    @news_ns.param('user_id', 'User ID for logging search history')
-    @news_ns.param('session_id', 'Session ID for tracking requests')
-    def get(self):
-        """Fetch news articles based on a keyword and store them in Supabase.
-
-        This endpoint fetches news articles matching the provided keyword,
-        stores them in Supabase, and logs the search history if a user ID is provided.
-
-        Args:
-            keyword (str): The search term for fetching news articles.
-            user_id (str, optional): User ID for logging search history.
-            session_id (str): Session ID for tracking the request.
-
-        Returns:
-            dict: Contains the stored article IDs and success status.
-            int: HTTP 200 on success, 500 on error.
-        """
-        try:
-            keyword = request.args.get('keyword', '')
-            user_id = request.args.get('user_id')  # optional
-            session_id = request.args.get('session_id')
-            print(f"[DEBUG] [api_gateway] [news_fetch] Called with keyword: '{keyword}', user_id: {user_id}, session_id: {session_id}")
-
-            print(f"[DEBUG] [api_gateway] [news_fetch] Fetching news articles for keyword: '{keyword}'")
-            articles = fetch_news(keyword)  # This returns a list of articles.
-            print(f"[DEBUG] [api_gateway] [news_fetch] Found {len(articles) if articles else 0} articles")
-            stored_article_ids = []
-
-            for article in articles:
-                print(f"[DEBUG] [api_gateway] [news_fetch] Storing article: {article.get('title', 'No title')}")
-                article_id = store_article_in_supabase(article)
-                stored_article_ids.append(article_id)
-                print(f"[DEBUG] [api_gateway] [news_fetch] Stored article with ID: {article_id}")
-
-                if user_id:
-                    print(f"[DEBUG] [api_gateway] [news_fetch] Logging search for user {user_id}, article {article_id}")
-                    log_user_search(user_id, article_id, session_id)
-
-            print(f"[DEBUG] [api_gateway] [news_fetch] Returning {len(stored_article_ids)} article IDs")
-            return make_response(jsonify({
-                'status': 'success',
-                'data': stored_article_ids
-            }), 200)
-
-        except Exception as e:
-            print(f"[DEBUG] [api_gateway] [news_fetch] Error: {str(e)}")
-            return make_response(jsonify({
-                'status': 'error',
-                'message': str(e)
-            }), 500)
-
-@news_ns.route('/process')
-class NewsProcess(Resource):
-    @news_ns.param('session_id', 'Session ID for tracking requests (optional)')
-    def post(self):
-        """Process and summarize a batch of articles.
-
-        This endpoint processes articles based on the provided article IDs in the request body,
-        generating summaries and checking bookmark status for the user if authenticated.
-
-        Returns:
-            dict: Contains processed articles data and success status.
-            int: HTTP 200 on success, 500 on error.
-        """
-        try:
-            session_id = request.args.get('session_id')
-
-            # Try to get user_id from JWT token if it exists
-            user_id = None
-            auth_header = request.headers.get('Authorization')
-            if auth_header:
-                try:
-                    token = auth_header.split()[1]  # Extract token from 'Bearer <token>'
-                    payload = jwt.decode(token, app.config['SECRET_KEY'], algorithms=['HS256'], audience='authenticated')
-                    user_id = payload.get('sub')
-                    print(f"[DEBUG] [api_gateway] [news_process] Extracted user_id from token: {user_id}")
-                except Exception as e:
-                    print(f"[DEBUG] [api_gateway] [news_process] Could not extract user_id from token: {str(e)}")
-
-            print(f"[DEBUG] [api_gateway] [news_process] Called with session_id: {session_id}, user_id: {user_id}")
-
-            # Get article_ids from request body
-            request_data = request.get_json()
-            article_ids = request_data.get('article_ids', [])
-
-            print(f"[DEBUG] [api_gateway] [news_process] Article IDs from request: {article_ids}")
-
-            if not article_ids:
-                return {
-                    'status': 'error',
-                    'message': 'No article IDs provided in request body'
-                }, 400
-
-            print("[DEBUG] [api_gateway] [news_process] Processing articles...")
-            summarized_articles = process_articles(article_ids, user_id)
-            print(f"[DEBUG] [api_gateway] [news_process] Processed {len(summarized_articles) if summarized_articles else 0} articles")
-
-            return {
-                'status': 'success',
-                'message': 'Articles processed and summarized successfully',
-                'data': summarized_articles,
-                'session_id': session_id
-            }, 200
-
-        except Exception as e:
-            print(f"[DEBUG] [api_gateway] [news_process] Error: {str(e)}")
-            logger.error(f"Error processing articles: {str(e)}")
-            return {
-                'status': 'error',
-                'message': str(e)
-            }, 500
-
-@auth_ns.route('/signup')
-class Signup(Resource):
-    @auth_ns.expect(signup_model)
-    def post(self):
-        """Register a new user in the system.
-
-        Creates a new user account with the provided information and generates
-        a JWT token for immediate authentication.
-
-        Expected JSON payload:
-        {
-            'username': str (required),
-            'password': str (required),
-            'email': str (required),
-            'firstName': str (optional),
-            'lastName': str (optional)
-        }
-
-        Returns:
-            dict: Contains user data (excluding password) and JWT token.
-            int: HTTP 201 on success, 400 on validation error, 500 on server error.
-        """
-        print("[DEBUG] [api_gateway] [signup] User signup endpoint called")
-        data = request.get_json()
-        username = data.get('username')
-        password = data.get('password')
-        email = data.get('email')
-        firstName = data.get('firstName', '')
-        lastName = data.get('lastName', '')
-        print(f"[DEBUG] [api_gateway] [signup] Request for username: {username}, email: {email}")
-
-        if not username or not password or not email:
-            print("[DEBUG] [api_gateway] [signup] Validation failed: missing required fields")
-            return {'error': 'Username, password, and email are required'}, 400
-
-        users = load_users()
-        print(f"[DEBUG] [api_gateway] [signup] Loaded {len(users)} existing users")
-
-        # Check if username already exists
-        if any(u.get('username') == username for u in users):
-            print(f"[DEBUG] [api_gateway] [signup] Username {username} already exists")
-            return {'error': 'Username already exists'}, 400
-
-        # Create new user with unique ID
-        new_user = {
-            'id': str(uuid.uuid4()),
-            'username': username,
-            'password': password,
-            'email': email,
-            'firstName': firstName,
-            'lastName': lastName
-        }
-        print(f"[DEBUG] [api_gateway] [signup] Created new user with ID: {new_user['id']}")
-
-        users.append(new_user)
-
-        try:
-            # Save updated users list
-            print("[DEBUG] [api_gateway] [signup] Saving updated users list")
-            with open(os.path.join(os.path.dirname(os.path.dirname(__file__)), 'data', 'users.txt'), 'w') as f:
-                json.dump(users, f, indent=4)
-            print("[DEBUG] [api_gateway] [signup] Users list saved successfully")
-        except Exception as e:
-            print(f"[DEBUG] [api_gateway] [signup] Error saving user data: {str(e)}")
-            return {'error': 'Failed to save user data', 'message': str(e)}, 500
-
-        # Generate JWT token
-        print("[DEBUG] [api_gateway] [signup] Generating JWT token")
-        token = jwt.encode({
-            'id': new_user['id'],
-            'username': new_user['username'],
-            'exp': datetime.datetime.utcnow() + datetime.timedelta(hours=1)
-        }, app.config['SECRET_KEY'], algorithm='HS256')
-        print(f"[DEBUG] [api_gateway] [signup] Token generated: {token[:10]}...")
-
-        # Exclude password from response
-        user_data = {k: new_user[k] for k in new_user if k != 'password'}
-        print("[DEBUG] [api_gateway] [signup] Signup successful")
-        return {'message': 'User registered successfully', 'user': user_data, 'token': token}, 201
-
-@auth_ns.route('/login')
-class Login(Resource):
-    def post(self):
-        """Authenticate user and generate JWT token.
-
-        Validates user credentials and generates a JWT token for authenticated access.
-
-        Expected JSON payload:
-        {
-            'username': str (required),
-            'password': str (required)
-        }
-
-        Returns:
-            dict: Contains user data (excluding password) and JWT token.
-            int: HTTP 200 on success, 400 on validation error, 401 on invalid credentials.
-        """
-        print("[DEBUG] [api_gateway] [login] Login endpoint called")
-        data = request.get_json()
-        username = data.get('username')
-        password = data.get('password')
-        print(f"[DEBUG] [api_gateway] [login] Login attempt for username: {username}")
-
-        if not username or not password:
-            print("[DEBUG] [api_gateway] [login] Validation failed: missing username or password")
-            return {'error': 'Username and password are required'}, 400
-
-        users = load_users()
-        print(f"[DEBUG] [api_gateway] [login] Loaded {len(users)} users")
-        user = next((u for u in users if u.get('username') == username and u.get('password') == password), None)
-
-        if not user:
-            print(f"[DEBUG] [api_gateway] [login] Invalid credentials for username: {username}")
-            return {'error': 'Invalid credentials'}, 401
-
-        print(f"[DEBUG] [api_gateway] [login] Valid credentials for user: {user.get('id')}")
-        print("[DEBUG] [api_gateway] [login] Generating JWT token")
-        token = jwt.encode({
-            'id': user['id'],
-            'username': user['username'],
-            'exp': datetime.datetime.utcnow() + datetime.timedelta(hours=1)
-        }, app.config['SECRET_KEY'], algorithm='HS256')
-        print(f"[DEBUG] [api_gateway] [login] Token generated: {token[:10]}...")
-
-        user_data = {k: user[k] for k in user if k != 'password'}
-        print("[DEBUG] [api_gateway] [login] Login successful")
-        return {'token': token, 'user': user_data}
-
-@user_ns.route('/profile')
-class UserProfile(Resource):
-    @token_required
-    @user_ns.marshal_with(user_profile_model)
-    def get(self):
-        """Retrieve authenticated user's profile information.
-
-        Requires a valid JWT token in the Authorization header.
-        Returns the user's profile data excluding sensitive information.
-
-        Returns:
-            dict: User profile data including id, username, email, and names.
-            int: HTTP 200 on success, 404 if user not found.
-        """
-        print("[DEBUG] [api_gateway] [user_profile] Called")
-        auth_header = request.headers.get('Authorization')
-        token = auth_header.split()[1]
-        print(f"[DEBUG] [api_gateway] [user_profile] Decoding token: {token[:10]}...")
-        payload = jwt.decode(token, app.config['SECRET_KEY'], algorithms=['HS256'])
-        print(f"[DEBUG] [api_gateway] [user_profile] Looking up user with ID: {payload.get('id')}")
-
-        users = load_users()
-        user = next((u for u in users if u.get('id') == payload.get('id')), None)
-        if not user:
-            print(f"[DEBUG] [api_gateway] [user_profile] User not found with ID: {payload.get('id')}")
-            return {'error': 'User not found'}, 404
-
-        print(f"[DEBUG] [api_gateway] [user_profile] Found user: {user.get('username')}")
-        return {k: user[k] for k in user if k != 'password'}, 200
-
-@bookmark_ns.route('/')
-class Bookmark(Resource):
-    @token_required
-    def get(self):
-        """Retrieve all bookmarks for the authenticated user.
-
-        Requires a valid JWT token in the Authorization header.
-        Returns a list of bookmarked articles for the current user.
-
-        Returns:
-            dict: Contains list of bookmarked articles and success status.
-            int: HTTP 200 on success, 500 on error.
-        """
-        try:
-            print("[DEBUG] [api_gateway] [get_bookmarks] Called")
-            auth_header = request.headers.get('Authorization')
-            token = auth_header.split()[1]
-            print(f"[DEBUG] [api_gateway] [get_bookmarks] Decoding token: {token[:10]}...")
-            payload = jwt.decode(token, app.config['SECRET_KEY'], algorithms=['HS256'], audience='authenticated')
-            user_id = payload.get('sub')
-            print(f"[DEBUG] [api_gateway] [get_bookmarks] Getting bookmarks for user: {user_id}")
-
-            bookmarks = get_user_bookmarks(user_id)
-            print(f"[DEBUG] [api_gateway] [get_bookmarks] Found {len(bookmarks)} bookmarks")
-
-            return {
-                'status': 'success',
-                'data': bookmarks
-            }, 200
-
-        except Exception as e:
-            print(f"[DEBUG] [api_gateway] [get_bookmarks] Error: {str(e)}")
-            logger.error(f"Error fetching bookmarks: {str(e)}")
-            return {
-                'status': 'error',
-                'message': str(e)
-            }, 500
-
-    @token_required
-    def post(self):
-        """Add a new bookmark for the authenticated user.
-
-        Requires a valid JWT token in the Authorization header.
-        Creates a bookmark linking the user to a specific news article.
-
-        Expected JSON payload:
-        {
-            'news_id': str (required)
-        }
-
-        Returns:
-            dict: Contains bookmark ID and success status.
-            int: HTTP 201 on success, 400 on validation error, 500 on server error.
-        """
-        try:
-            print("[DEBUG] [api_gateway] [add_bookmark] Called")
-            auth_header = request.headers.get('Authorization')
-            token = auth_header.split()[1]
-            print(f"[DEBUG] [api_gateway] [add_bookmark] Decoding token: {token[:10]}...")
-            payload = jwt.decode(token, app.config['SECRET_KEY'], algorithms=['HS256'], audience='authenticated')
-            user_id = payload.get('sub')
-            print(f"[DEBUG] [api_gateway] [add_bookmark] Adding bookmark for user: {user_id}")
-
-            data = request.get_json()
-            news_id = data.get('news_id')
-            print(f"[DEBUG] [api_gateway] [add_bookmark] News article ID: {news_id}")
-
-            if not news_id:
-                print("[DEBUG] [api_gateway] [add_bookmark] News article ID missing in request")
-                return {'error': 'News article ID is required'}, 400
-
-            print(f"[DEBUG] [api_gateway] [add_bookmark] Adding bookmark for user {user_id}, article {news_id}")
-            bookmark = add_bookmark(user_id, news_id)
-            print(f"[DEBUG] [api_gateway] [add_bookmark] Bookmark added with ID: {bookmark['id'] if isinstance(bookmark, dict) else bookmark}")
-
-            return {
-                'status': 'success',
-                'message': 'Bookmark added successfully',
-                'data': {
-                    'bookmark_id': bookmark['id'] if isinstance(bookmark, dict) else bookmark
-                }
-            }, 201
-
-        except Exception as e:
-            print(f"[DEBUG] [api_gateway] [add_bookmark] Error: {str(e)}")
-            logger.error(f"Error adding bookmark: {str(e)}")
-            return {
-                'status': 'error',
-                'message': str(e)
-            }, 500
-
-@bookmark_ns.route('/<bookmark_id>')
-class BookmarkDelete(Resource):
-    @token_required
-    def delete(self, bookmark_id):
-        """Remove a bookmark for a news article.
-
-        Requires a valid JWT token in the Authorization header.
-        Deletes the specified bookmark for the authenticated user.
-
-        Args:
-            bookmark_id (str): The ID of the bookmark to be deleted.
-
-        Returns:
-            dict: Contains success message.
-            int: HTTP 200 on success, 500 on error.
-        """
-        try:
-            print(f"[DEBUG] [api_gateway] [delete_bookmark] Called for bookmark: {bookmark_id}")
-            auth_header = request.headers.get('Authorization')
-            token = auth_header.split()[1]
-            print(f"[DEBUG] [api_gateway] [delete_bookmark] Decoding token: {token[:10]}...")
-            payload = jwt.decode(token, app.config['SECRET_KEY'], algorithms=['HS256'], audience='authenticated')
-            user_id = payload.get('sub')
-            print(f"[DEBUG] [api_gateway] [delete_bookmark] Deleting bookmark {bookmark_id} for user {user_id}")
-
-            result = delete_bookmark(user_id, bookmark_id)
-            print(f"[DEBUG] [api_gateway] [delete_bookmark] Deletion result: {result}")
-
-            return {
-                'status': 'success',
-                'message': 'Bookmark removed successfully'
-            }, 200
-
-        except Exception as e:
-            print(f"[DEBUG] [api_gateway] [delete_bookmark] Error: {str(e)}")
-            logger.error(f"Error removing bookmark: {str(e)}")
-            return {
-                'status': 'error',
-                'message': str(e)
-            }, 500
-
-@story_tracking_ns.route('/')
-class StoryTracking(Resource):
-    @story_tracking_ns.param('keyword', 'Keyword to track for news updates')
-    def get(self):
-        """Fetch latest news for a tracked keyword.
-
-        Retrieves and processes the latest news articles for a given keyword.
-
-        Args:
-            keyword (str): The keyword to search for news articles.
-
-        Returns:
-            dict: Contains list of processed articles and success status.
-            int: HTTP 200 on success, 400 if keyword is missing, 500 on error.
-        """
-        try:
-            print("[DEBUG] [api_gateway] [story_tracking] Story tracking get endpoint called")
-            keyword = request.args.get('keyword')
-            print(f"[DEBUG] [api_gateway] [story_tracking] Requested keyword: '{keyword}'")
-            if not keyword:
-                print("[DEBUG] [api_gateway] [story_tracking] Keyword parameter missing")
-                return make_response(jsonify({
-                    'status': 'error',
-                    'message': 'Keyword parameter is required'
-                }), 400)
-
-            print(f"[DEBUG] [api_gateway] [story_tracking] Fetching news for keyword: '{keyword}'")
-            articles = fetch_news(keyword)
-            print(f"[DEBUG] [api_gateway] [story_tracking] Found {len(articles) if articles else 0} articles")
-
-            processed_articles = []
-            for article in articles:
-                print(f"[DEBUG] [api_gateway] [story_tracking] Processing article: {article.get('title', 'No title')}")
-                article_id = store_article_in_supabase(article)
-                print(f"[DEBUG] [api_gateway] [story_tracking] Stored article with ID: {article_id}")
-                processed_articles.append({
-                    'id': article_id,
-                    'title': article.get('title'),
-                    'url': article.get('url'),
-                    'source': article.get('source', {}).get('name') if isinstance(article.get('source'), dict) else article.get('source'),
-                    'publishedAt': article.get('publishedAt', datetime.now().isoformat())
-                })
-
-            print(f"[DEBUG] [api_gateway] [story_tracking] Returning {len(processed_articles)} processed articles")
-            return make_response(jsonify({
-                'status': 'success',
-                'articles': processed_articles
-            }), 200)
-
-        except Exception as e:
-            print(f"[DEBUG] [api_gateway] [story_tracking] Error: {str(e)}")
-            logger.error(f"Error in story tracking: {str(e)}")
-            return make_response(jsonify({
-                'status': 'error',
-                'message': str(e)
-            }), 500)
-
-    @token_required
-    def post(self):
-        """Create a new tracked story.
-
-        Requires a valid JWT token in the Authorization header.
-        Creates a new tracked story for the authenticated user based on a keyword and source article.
-
-        Expected JSON payload:
-        {
-            'keyword': str (required),
-            'sourceArticleId': str (optional)
-        }
-
-        Returns:
-            dict: Contains created story details and success status.
-            int: HTTP 201 on success, 400 on validation error, 500 on server error.
-        """
-        try:
-            print("[DEBUG] [api_gateway] [story_tracking] Called")
-            auth_header = request.headers.get('Authorization')
-            token = auth_header.split()[1]
-            print(f"[DEBUG] [api_gateway] [story_tracking] Decoding token: {token[:10]}...")
-            payload = jwt.decode(token, app.config['SECRET_KEY'], algorithms=['HS256'], audience='authenticated')
-            user_id = payload.get('sub')
-            print(f"[DEBUG] [api_gateway] [story_tracking] Creating tracked story for user: {user_id}")
-
-            data = request.get_json()
-            keyword = data.get('keyword')
-            source_article_id = data.get('sourceArticleId')
-            print(f"[DEBUG] [api_gateway] [story_tracking] Story details - Keyword: '{keyword}', Source article: {source_article_id}")
-
-            if not keyword:
-                print("[DEBUG] [api_gateway] [story_tracking] Keyword parameter missing in request")
-                return make_response(jsonify({
-                    'status': 'error',
-                    'message': 'Keyword is required'
-                }), 400)
-
-            print(f"[DEBUG] [api_gateway] [story_tracking] Calling create_tracked_story with user_id: {user_id}, keyword: '{keyword}'")
-            tracked_story = create_tracked_story(user_id, keyword, source_article_id)
-            print(f"[DEBUG] [api_gateway] [story_tracking] Tracked story created with ID: {tracked_story['id'] if tracked_story else 'unknown'}")
-
-            print(f"[DEBUG] [api_gateway] [story_tracking] Getting full story details for story: {tracked_story['id']}")
-            story_with_articles = get_story_details(tracked_story['id'])
-            print(f"[DEBUG] [api_gateway] [story_tracking] Found {len(story_with_articles.get('articles', [])) if story_with_articles else 0} related articles")
-
-            return make_response(jsonify({
-                'status': 'success',
-                'data': story_with_articles
-            }), 201)
-
-        except Exception as e:
-            print(f"[DEBUG] [api_gateway] [story_tracking] Error: {str(e)}")
-            logger.error(f"Error creating tracked story: {str(e)}")
-            return make_response(jsonify({
-                'status': 'error',
-                'message': str(e)
-            }), 500)
-
-@story_tracking_ns.route('/user')
-class UserStoryTracking(Resource):
-    @token_required
-    def get(self):
-        """Get all tracked stories for the authenticated user.
-
-        Requires a valid JWT token in the Authorization header.
-        Retrieves all tracked stories associated with the authenticated user.
-
-        Returns:
-            dict: Contains list of tracked stories and success status.
-            int: HTTP 200 on success, 500 on error.
-        """
-        try:
-            print("[DEBUG] [api_gateway] [user_story_tracking] Called")
-            auth_header = request.headers.get('Authorization')
-            token = auth_header.split()[1]
-            print(f"[DEBUG] [api_gateway] [user_story_tracking] Decoding token: {token[:10]}...")
-            payload = jwt.decode(token, app.config['SECRET_KEY'], algorithms=['HS256'], audience='authenticated')
-            user_id = payload.get('sub')
-            print(f"[DEBUG] [api_gateway] [user_story_tracking] Getting tracked stories for user: {user_id}")
-
-            print(f"[DEBUG] [api_gateway] [user_story_tracking] Calling get_tracked_stories")
-            tracked_stories = get_tracked_stories(user_id)
-            print(f"[DEBUG] [api_gateway] [user_story_tracking] Found {len(tracked_stories)} tracked stories")
-
-            return make_response(jsonify({
-                'status': 'success',
-                'data': tracked_stories
-            }), 200)
-
-        except Exception as e:
-            print(f"[DEBUG] [api_gateway] [user_story_tracking] Error: {str(e)}")
-            logger.error(f"Error getting tracked stories: {str(e)}")
-            return make_response(jsonify({
-                'status': 'error',
-                'message': str(e)
-            }), 500)
-
-@story_tracking_ns.route('/<story_id>')
-class StoryTrackingDetail(Resource):
-    @token_required
-    def get(self, story_id):
-        """Get details for a specific tracked story.
-
-        Requires a valid JWT token in the Authorization header.
-        Retrieves detailed information about a specific tracked story.
-
-        Args:
-            story_id (str): The ID of the tracked story to retrieve.
-
-        Returns:
-            dict: Contains story details and success status.
-            int: HTTP 200 on success, 404 if story not found, 500 on error.
-        """
-        try:
-            print(f"[DEBUG] [api_gateway] [story_tracking_detail] Called for story: {story_id}")
-            print(f"[DEBUG] [api_gateway] [story_tracking_detail] Calling get_story_details for story: {story_id}")
-            story = get_story_details(story_id)
-
-            if not story:
-                print(f"[DEBUG] [api_gateway] [story_tracking_detail] No story found with ID: {story_id}")
-                return make_response(jsonify({
-                    'status': 'error',
-                    'message': 'Tracked story not found'
-                }), 404)
-
-            print(f"[DEBUG] [api_gateway] [story_tracking_detail] Found story: {story['keyword']}")
-            print(f"[DEBUG] [api_gateway] [story_tracking_detail] Story has {len(story.get('articles', []))} articles")
-            return make_response(jsonify({
-                'status': 'success',
-                'data': story
-            }), 200)
-
-        except Exception as e:
-            print(f"[DEBUG] [api_gateway] [story_tracking_detail] Error: {str(e)}")
-            logger.error(f"Error getting story details: {str(e)}")
-            return make_response(jsonify({
-                'status': 'error',
-                'message': str(e)
-            }), 500)
-
-    @token_required
-    def delete(self, story_id):
-        """Stop tracking a story.
-
-        Requires a valid JWT token in the Authorization header.
-        Deletes a tracked story for the authenticated user.
-
-        Args:
-            story_id (str): The ID of the tracked story to delete.
-
-        Returns:
-            dict: Contains success message.
-            int: HTTP 200 on success, 404 if story not found, 500 on error.
-        """
-        try:
-            print(f"[DEBUG] [api_gateway] [delete_story_tracking] Called for story: {story_id}")
-            auth_header = request.headers.get('Authorization')
-            token = auth_header.split()[1]
-            print(f"[DEBUG] [api_gateway] [delete_story_tracking] Decoding token: {token[:10]}...")
-            payload = jwt.decode(token, app.config['SECRET_KEY'], algorithms=['HS256'], audience='authenticated')
-            user_id = payload.get('sub')
-            print(f"[DEBUG] [api_gateway] [delete_story_tracking] Deleting tracked story {story_id} for user {user_id}")
-
-            print(f"[DEBUG] [api_gateway] [delete_story_tracking] Calling delete_tracked_story")
-            success = delete_tracked_story(user_id, story_id)
-            print(f"[DEBUG] [api_gateway] [delete_story_tracking] Delete result: {success}")
-
-            if not success:
-                print(f"[DEBUG] [api_gateway] [delete_story_tracking] Failed to delete story or story not found")
-                return make_response(jsonify({
-                    'status': 'error',
-                    'message': 'Failed to delete tracked story or story not found'
-                }), 404)
-
-            print(f"[DEBUG] [api_gateway] [delete_story_tracking] Story deleted successfully")
-            return make_response(jsonify({
-                'status': 'success',
-                'message': 'Tracked story deleted successfully'
-            }), 200)
-
-        except Exception as e:
-            print(f"[DEBUG] [api_gateway] [delete_story_tracking] Error: {str(e)}")
-            logger.error(f"Error deleting tracked story: {str(e)}")
-            return make_response(jsonify({
-                'status': 'error',
-                'message': str(e)
-            }), 500)
-
-@app.route('/api/story_tracking', methods=['OPTIONS'])
-def story_tracking_options():
-    """Handle OPTIONS requests for the story tracking endpoint.
+# User profile model is now defined in routes/user.py
-
-    This function sets the necessary CORS headers for preflight requests
-    to the story tracking endpoint.
+# API models for other endpoints are defined in their respective modules
-
-    Returns:
-        Response: A Flask response object with appropriate CORS headers.
-    """
-    print("[DEBUG] [api_gateway] [story_tracking_options] Called")
-    response = make_response()
-    response.headers.add("Access-Control-Allow-Origin", "*")
-    response.headers.add("Access-Control-Allow-Headers", "Content-Type,Authorization")
-    response.headers.add("Access-Control-Allow-Methods", "GET,POST,PUT,DELETE,OPTIONS")
-    print("[DEBUG] [api_gateway] [story_tracking_options] Responding with CORS headers")
-    return response
+logger.info("API Gateway initialization completed successfully")
 
 if __name__ == '__main__':
-    # Read the port from the environment (Cloud Run sets the PORT variable)
-    port = int(os.environ.get("PORT", 8080))
-    print(f"Starting server on port {port}")
-    app.run(host="0.0.0.0", port=port)
+    try:
+        # Read the port from the environment (Cloud Run sets the PORT variable)
+        port = int(os.environ.get("PORT", 8080))
+        logger.info(f"Starting server on port {port}")
+        app.run(host="0.0.0.0", port=port)
+    except Exception as e:
+        logger.critical(f"Failed to start server: {str(e)}")
+        sys.exit(1)
+""" + +# Standard library imports +from flask import jsonify, request, make_response +from flask_restx import Resource, Namespace, fields +import jwt +import uuid +import datetime +import os +import json +from functools import wraps + +# Import microservices and utilities +from backend.microservices.auth_service import load_users +from backend.core.utils import setup_logger + +# Initialize logger +logger = setup_logger(__name__) + +# Create auth namespace +auth_ns = Namespace('api/auth', description='Authentication operations') + +# Define API models for request/response documentation +signup_model = auth_ns.model('Signup', { + 'username': fields.String(required=True, description='Username'), + 'password': fields.String(required=True, description='Password'), + 'email': fields.String(required=True, description='Email address'), + 'firstName': fields.String(required=False, description='First name'), + 'lastName': fields.String(required=False, description='Last name') +}) + +@auth_ns.route('/signup') +class Signup(Resource): + @auth_ns.expect(signup_model) + def post(self): + """Register a new user in the system. + + Creates a new user account with the provided information and generates + a JWT token for immediate authentication. + + Expected JSON payload: + { + 'username': str (required), + 'password': str (required), + 'email': str (required), + 'firstName': str (optional), + 'lastName': str (optional) + } + + Returns: + dict: Contains user data (excluding password) and JWT token. + int: HTTP 201 on success, 400 on validation error, 500 on server error. + """ + logger.info("User signup endpoint called") + data = request.get_json() + username = data.get('username') + password = data.get('password') + email = data.get('email') + firstName = data.get('firstName', '') + lastName = data.get('lastName', '') + logger.info(f"Signup request for username: {username}, email: {email}") + + if not username or not password or not email: + logger.warning("Signup validation failed: missing required fields") + return {'error': 'Username, password, and email are required'}, 400 + + users = load_users() + logger.debug(f"Loaded {len(users)} existing users") + + # Check if username already exists + if any(u.get('username') == username for u in users): + logger.warning(f"Signup failed: Username {username} already exists") + return {'error': 'Username already exists'}, 400 + + # Create new user with unique ID + new_user = { + 'id': str(uuid.uuid4()), + 'username': username, + 'password': password, + 'email': email, + 'firstName': firstName, + 'lastName': lastName + } + logger.debug(f"Created new user with ID: {new_user['id']}") + + users.append(new_user) + + try: + # Save updated users list + logger.debug("Saving updated users list") + with open(os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), 'data', 'users.txt'), 'w') as f: + json.dump(users, f, indent=4) + logger.debug("Users list saved successfully") + except Exception as e: + logger.error(f"Error saving user data: {str(e)}") + return {'error': 'Failed to save user data', 'message': str(e)}, 500 + + # Generate JWT token + logger.debug("Generating JWT token") + from flask import current_app + token = jwt.encode({ + 'sub': new_user['id'], + 'username': new_user['username'], + 'exp': datetime.datetime.utcnow() + datetime.timedelta(hours=1), + 'aud': 'authenticated' + }, current_app.config['SECRET_KEY'], algorithm='HS256') + logger.debug(f"Token generated: {token[:10]}...") + + # Exclude password from response + user_data = {k: 
new_user[k] for k in new_user if k != 'password'} + logger.info("Signup successful") + return {'message': 'User registered successfully', 'user': user_data, 'token': token}, 201 + +@auth_ns.route('/login') +class Login(Resource): + def post(self): + """Authenticate user and generate JWT token. + + Validates user credentials and generates a JWT token for authenticated access. + + Expected JSON payload: + { + 'username': str (required), + 'password': str (required) + } + + Returns: + dict: Contains user data (excluding password) and JWT token. + int: HTTP 200 on success, 400 on validation error, 401 on invalid credentials. + """ + logger.info("Login endpoint called") + data = request.get_json() + username = data.get('username') + password = data.get('password') + logger.info(f"Login attempt for username: {username}") + + if not username or not password: + logger.warning("Login validation failed: missing username or password") + return {'error': 'Username and password are required'}, 400 + + users = load_users() + logger.debug(f"Loaded {len(users)} users") + user = next((u for u in users if u.get('username') == username and u.get('password') == password), None) + + if not user: + logger.warning(f"Invalid credentials for username: {username}") + return {'error': 'Invalid credentials'}, 401 + + logger.debug(f"Valid credentials for user: {user.get('id')}") + logger.debug("Generating JWT token") + from flask import current_app + token = jwt.encode({ + 'sub': user['id'], + 'username': user['username'], + 'exp': datetime.datetime.utcnow() + datetime.timedelta(hours=1), + 'aud': 'authenticated' + }, current_app.config['SECRET_KEY'], algorithm='HS256') + logger.debug(f"Token generated: {token[:10]}...") + + user_data = {k: user[k] for k in user if k != 'password'} + logger.info("Login successful") + return {'token': token, 'user': user_data} \ No newline at end of file diff --git a/backend/api_gateway/routes/bookmark.py b/backend/api_gateway/routes/bookmark.py new file mode 100644 index 0000000..dc0969b --- /dev/null +++ b/backend/api_gateway/routes/bookmark.py @@ -0,0 +1,155 @@ +#!/usr/bin/env python3 +""" +Bookmark API Routes + +This module contains the API routes for bookmark operations including adding, listing, and deleting bookmarks. +""" + +# Standard library imports +from flask import jsonify, request, make_response +from flask_restx import Resource, Namespace +import jwt +from functools import wraps +from flask import current_app + +# Import microservices and utilities +from backend.microservices.news_storage import add_bookmark, get_user_bookmarks, delete_bookmark +from backend.core.utils import setup_logger + +# Initialize logger +logger = setup_logger(__name__) + +# Create bookmark namespace +bookmark_ns = Namespace('api/bookmarks', description='Bookmark operations') + +# Import token_required decorator from utils +from backend.api_gateway.utils.auth import token_required + +@bookmark_ns.route('/') +class Bookmark(Resource): + @token_required + def get(self): + """Retrieve all bookmarks for the authenticated user. + + Requires a valid JWT token in the Authorization header. + Returns a list of bookmarked articles for the current user. + + Returns: + dict: Contains list of bookmarked articles and success status. + int: HTTP 200 on success, 500 on error. 
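Worth noting: the old gateway signed tokens as {'id': ..., 'username': ..., 'exp': ...} while its own token_required decoded with audience='authenticated', so locally issued tokens could never pass that check. The new 'sub' and 'aud' claims line up with the decode call. A standalone PyJWT round-trip illustrating the fix (secret and user values are placeholders):

    import datetime

    import jwt

    SECRET = 'your-secret-key'

    token = jwt.encode({
        'sub': 'user-123',
        'username': 'alice',
        'exp': datetime.datetime.utcnow() + datetime.timedelta(hours=1),
        'aud': 'authenticated'
    }, SECRET, algorithm='HS256')

    # Succeeds: the expected audience matches the token's 'aud' claim.
    payload = jwt.decode(token, SECRET, algorithms=['HS256'], audience='authenticated')
    assert payload['sub'] == 'user-123'

    # A legacy-style token carries no 'aud' claim, so the same check fails.
    legacy = jwt.encode({'id': 'user-123'}, SECRET, algorithm='HS256')
    try:
        jwt.decode(legacy, SECRET, algorithms=['HS256'], audience='authenticated')
    except jwt.MissingRequiredClaimError as e:
        print(e)  # Token is missing the "aud" claim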
+ """ + try: + logger.info("Get bookmarks endpoint called") + auth_header = request.headers.get('Authorization') + token = auth_header.split()[1] + logger.debug(f"Decoding token: {token[:10]}...") + payload = jwt.decode(token, current_app.config['SECRET_KEY'], algorithms=['HS256'], audience='authenticated') + user_id = payload.get('sub') + logger.info(f"Getting bookmarks for user: {user_id}") + + bookmarks = get_user_bookmarks(user_id) + logger.debug(f"Found {len(bookmarks)} bookmarks") + + return { + 'status': 'success', + 'data': bookmarks + }, 200 + + except Exception as e: + logger.error(f"Error fetching bookmarks: {str(e)}") + return { + 'status': 'error', + 'message': str(e) + }, 500 + + @token_required + def post(self): + """Add a new bookmark for the authenticated user. + + Requires a valid JWT token in the Authorization header. + Creates a bookmark linking the user to a specific news article. + + Expected JSON payload: + { + 'news_id': str (required) + } + + Returns: + dict: Contains bookmark ID and success status. + int: HTTP 201 on success, 400 on validation error, 500 on server error. + """ + try: + logger.info("Add bookmark endpoint called") + auth_header = request.headers.get('Authorization') + token = auth_header.split()[1] + logger.debug(f"Decoding token: {token[:10]}...") + payload = jwt.decode(token, current_app.config['SECRET_KEY'], algorithms=['HS256'], audience='authenticated') + user_id = payload.get('sub') + logger.info(f"Adding bookmark for user: {user_id}") + + data = request.get_json() + news_id = data.get('news_id') + logger.debug(f"News article ID: {news_id}") + + if not news_id: + logger.warning("News article ID missing in request") + return {'error': 'News article ID is required'}, 400 + + logger.info(f"Adding bookmark for user {user_id}, article {news_id}") + bookmark = add_bookmark(user_id, news_id) + logger.debug(f"Bookmark added with ID: {bookmark['id'] if isinstance(bookmark, dict) else bookmark}") + + return { + 'status': 'success', + 'message': 'Bookmark added successfully', + 'data': { + 'bookmark_id': bookmark['id'] if isinstance(bookmark, dict) else bookmark + } + }, 201 + + except Exception as e: + logger.error(f"Error adding bookmark: {str(e)}") + return { + 'status': 'error', + 'message': str(e) + }, 500 + +@bookmark_ns.route('/') +class BookmarkDelete(Resource): + @token_required + def delete(self, bookmark_id): + """Remove a bookmark for a news article. + + Requires a valid JWT token in the Authorization header. + Deletes the specified bookmark for the authenticated user. + + Args: + bookmark_id (str): The ID of the bookmark to be deleted. + + Returns: + dict: Contains success message. + int: HTTP 200 on success, 500 on error. 
+ """ + try: + logger.info(f"Delete bookmark endpoint called for bookmark: {bookmark_id}") + auth_header = request.headers.get('Authorization') + token = auth_header.split()[1] + logger.debug(f"Decoding token: {token[:10]}...") + payload = jwt.decode(token, current_app.config['SECRET_KEY'], algorithms=['HS256'], audience='authenticated') + user_id = payload.get('sub') + logger.info(f"Deleting bookmark {bookmark_id} for user {user_id}") + + result = delete_bookmark(user_id, bookmark_id) + logger.debug(f"Deletion result: {result}") + + return { + 'status': 'success', + 'message': 'Bookmark removed successfully' + }, 200 + + except Exception as e: + logger.error(f"Error removing bookmark: {str(e)}") + return { + 'status': 'error', + 'message': str(e) + }, 500 \ No newline at end of file diff --git a/backend/api_gateway/routes/health.py b/backend/api_gateway/routes/health.py new file mode 100644 index 0000000..9a3263d --- /dev/null +++ b/backend/api_gateway/routes/health.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python3 +""" +Health API Routes + +This module contains the API routes for health check operations. +""" + +# Standard library imports +from flask import jsonify, request +from flask_restx import Resource, Namespace +from backend.core.utils import setup_logger + +# Initialize logger +logger = setup_logger(__name__) + +# Create health namespace +health_ns = Namespace('health', description='Health check operations') + +@health_ns.route('/') +class HealthCheck(Resource): + def get(self): + """Check the health status of the API Gateway. + + Returns: + dict: A dictionary containing the health status. + int: HTTP 200 status code indicating success. + """ + logger.info("Health check endpoint called") + return {"status": "API Gateway is healthy"}, 200 \ No newline at end of file diff --git a/backend/api_gateway/routes/news.py b/backend/api_gateway/routes/news.py new file mode 100644 index 0000000..a2fa668 --- /dev/null +++ b/backend/api_gateway/routes/news.py @@ -0,0 +1,140 @@ +#!/usr/bin/env python3 +""" +News API Routes + +This module contains the API routes for news operations including fetching and processing. +""" + +# Standard library imports +from flask import jsonify, request, make_response +from flask_restx import Resource, Namespace +import jwt + +# Import microservices and utilities +from backend.microservices.news_fetcher import fetch_news +from backend.microservices.news_storage import store_article_in_supabase, log_user_search +from backend.microservices.summarization_service import process_articles +from backend.core.utils import setup_logger + +# Initialize logger +logger = setup_logger(__name__) + +# Create news namespace +news_ns = Namespace('api/news', description='News operations') + +@news_ns.route('/fetch') +class NewsFetch(Resource): + @news_ns.param('keyword', 'Search keyword for news') + @news_ns.param('user_id', 'User ID for logging search history') + @news_ns.param('session_id', 'Session ID for tracking requests') + def get(self): + """Fetch news articles based on a keyword and store them in Supabase. + + This endpoint fetches news articles matching the provided keyword, + stores them in Supabase, and logs the search history if a user ID is provided. + + Args: + keyword (str): The search term for fetching news articles. + user_id (str, optional): User ID for logging search history. + session_id (str): Session ID for tracking the request. + + Returns: + dict: Contains the stored article IDs and success status. + int: HTTP 200 on success, 500 on error. 
+ """ + try: + keyword = request.args.get('keyword', '') + user_id = request.args.get('user_id') # optional + session_id = request.args.get('session_id') + logger.info(f"News fetch endpoint called with keyword: '{keyword}', user_id: {user_id}, session_id: {session_id}") + + logger.info(f"Fetching news articles for keyword: '{keyword}'") + articles = fetch_news(keyword) # This returns a list of articles. + logger.info(f"Found {len(articles) if articles else 0} articles for keyword: '{keyword}'") + + stored_article_ids = [] + + for article in articles: + logger.debug(f"Storing article: {article.get('title', 'No title')}") + article_id = store_article_in_supabase(article) + stored_article_ids.append(article_id) + logger.debug(f"Stored article with ID: {article_id}") + + if user_id: + logger.debug(f"Logging search for user {user_id}, article {article_id}") + log_user_search(user_id, article_id, session_id) + + logger.info(f"Returning {len(stored_article_ids)} article IDs") + return make_response(jsonify({ + 'status': 'success', + 'data': stored_article_ids + }), 200) + + except Exception as e: + logger.error(f"Error fetching news: {str(e)}") + return make_response(jsonify({ + 'status': 'error', + 'message': str(e) + }), 500) + +@news_ns.route('/process') +class NewsProcess(Resource): + @news_ns.param('session_id', 'Session ID for tracking requests (optional)') + def post(self): + """Process and summarize a batch of articles. + + This endpoint processes articles based on the provided article IDs in the request body, + generating summaries and checking bookmark status for the user if authenticated. + + Returns: + dict: Contains processed articles data and success status. + int: HTTP 200 on success, 500 on error. + """ + try: + session_id = request.args.get('session_id') + + # Try to get user_id from JWT token if it exists + user_id = None + auth_header = request.headers.get('Authorization') + if auth_header: + try: + token = auth_header.split()[1] # Extract token from 'Bearer ' + # Note: The secret key should be imported from the main app + from flask import current_app + payload = jwt.decode(token, current_app.config['SECRET_KEY'], algorithms=['HS256'], audience='authenticated') + user_id = payload.get('sub') + logger.debug(f"Extracted user_id from token: {user_id}") + except Exception as e: + logger.warning(f"Could not extract user_id from token: {str(e)}") + + logger.info(f"News process endpoint called with session_id: {session_id}, user_id: {user_id}") + + # Get article_ids from request body + request_data = request.get_json() + article_ids = request_data.get('article_ids', []) + + logger.debug(f"Article IDs from request: {article_ids}") + + if not article_ids: + return { + 'status': 'error', + 'message': 'No article IDs provided in request body' + }, 400 + + logger.info("Processing articles...") + summarized_articles = process_articles(article_ids, user_id) + logger.info(f"Processed {len(summarized_articles) if summarized_articles else 0} articles") + + return { + 'status': 'success', + 'message': 'Articles processed and summarized successfully', + 'data': summarized_articles, + 'session_id': session_id + }, 200 + + except Exception as e: + logger.error(f"Error processing articles: {str(e)}") + return { + 'status': 'error', + 'message': str(e) + }, 500 \ No newline at end of file diff --git a/backend/api_gateway/routes/story_tracking.py b/backend/api_gateway/routes/story_tracking.py new file mode 100644 index 0000000..dcba24e --- /dev/null +++ b/backend/api_gateway/routes/story_tracking.py 
diff --git a/backend/api_gateway/routes/story_tracking.py b/backend/api_gateway/routes/story_tracking.py
new file mode 100644
index 0000000..dcba24e
--- /dev/null
+++ b/backend/api_gateway/routes/story_tracking.py
@@ -0,0 +1,426 @@
+#!/usr/bin/env python3
+"""
+Story Tracking API Routes
+
+This module contains the API routes for story tracking operations including
+creating, retrieving, updating, and deleting tracked stories.
+"""
+
+# Third-party and standard library imports
+from flask import jsonify, request, make_response
+from flask_restx import Resource, Namespace
+import jwt
+from datetime import datetime
+import os
+
+# Import microservices and utilities
+from backend.microservices.news_fetcher import fetch_news
+from backend.microservices.news_storage import store_article_in_supabase
+from backend.microservices.story_tracking_service import (
+    get_tracked_stories,
+    create_tracked_story,
+    get_story_details,
+    delete_tracked_story,
+    toggle_polling
+)
+from backend.core.utils import setup_logger
+
+# Initialize logger
+logger = setup_logger(__name__)
+
+# Create story tracking namespace
+story_tracking_ns = Namespace('api/story_tracking', description='Story tracking operations')
+
+# Import token_required decorator from utils
+from backend.api_gateway.utils.auth import token_required
+
+@story_tracking_ns.route('')
+class StoryTracking(Resource):
+    @story_tracking_ns.param('keyword', 'Keyword to track for news updates')
+    def get(self):
+        """Fetch latest news for a tracked keyword.
+
+        Retrieves and processes the latest news articles for a given keyword.
+
+        Args:
+            keyword (str): The keyword to search for news articles.
+
+        Returns:
+            dict: Contains list of processed articles and success status.
+            int: HTTP 200 on success, 400 if keyword is missing, 500 on error.
+        """
+        try:
+            logger.debug("Story tracking get endpoint called")
+            keyword = request.args.get('keyword')
+            logger.debug(f"Requested keyword: '{keyword}'")
+            if not keyword:
+                logger.warning("Keyword parameter missing")
+                return make_response(jsonify({
+                    'status': 'error',
+                    'message': 'Keyword parameter is required'
+                }), 400)
+
+            logger.info(f"Fetching news for keyword: '{keyword}'")
+            articles = fetch_news(keyword)
+            logger.info(f"Found {len(articles) if articles else 0} articles for keyword: '{keyword}'")
+
+            processed_articles = []
+            for article in articles:
+                logger.debug(f"Processing article: {article.get('title', 'No title')}")
+                article_id = store_article_in_supabase(article)
+                logger.debug(f"Stored article with ID: {article_id}")
+                processed_articles.append({
+                    'id': article_id,
+                    'title': article.get('title'),
+                    'url': article.get('url'),
+                    'source': article.get('source', {}).get('name') if isinstance(article.get('source'), dict) else article.get('source'),
+                    'publishedAt': article.get('publishedAt', datetime.now().isoformat())
+                })
+
+            logger.info(f"Returning {len(processed_articles)} processed articles")
+            return make_response(jsonify({
+                'status': 'success',
+                'articles': processed_articles
+            }), 200)
+
+        except Exception as e:
+            logger.error(f"Error in story tracking: {str(e)}")
+            return make_response(jsonify({
+                'status': 'error',
+                'message': str(e)
+            }), 500)
+
+    @token_required
+    def post(self):
+        """Create a new tracked story.
+
+        Requires a valid JWT token in the Authorization header.
+        Creates a new tracked story for the authenticated user based on a keyword and source article.
+
+        Expected JSON payload:
+        {
+            'keyword': str (required),
+            'sourceArticleId': str (optional)
+        }
+
+        Returns:
+            dict: Contains created story details and success status.
+            int: HTTP 201 on success, 400 on validation error, 500 on server error.
+        """
+        try:
+            logger.debug("Story tracking post endpoint called")
+            auth_header = request.headers.get('Authorization')
+            token = auth_header.split()[1]
+            logger.debug(f"Decoding token: {token[:10]}...")
+            # Import app from main module to access config
+            from flask import current_app
+            payload = jwt.decode(token, current_app.config['SECRET_KEY'], algorithms=['HS256'], audience='authenticated')
+            user_id = payload.get('sub')
+            logger.info(f"Creating tracked story for user: {user_id}")
+
+            data = request.get_json()
+            keyword = data.get('keyword')
+            source_article_id = data.get('sourceArticleId')
+            logger.debug(f"Story details - Keyword: '{keyword}', Source article: {source_article_id}")
+
+            if not keyword:
+                logger.warning("Keyword parameter missing in request")
+                return make_response(jsonify({
+                    'status': 'error',
+                    'message': 'Keyword is required'
+                }), 400)
+
+            logger.debug(f"Calling create_tracked_story with user_id: {user_id}, keyword: '{keyword}'")
+            tracked_story = create_tracked_story(user_id, keyword, source_article_id)
+            logger.info(f"Tracked story created with ID: {tracked_story['id'] if tracked_story else 'unknown'}")
+
+            logger.debug(f"Getting full story details for story: {tracked_story['id']}")
+            story_with_articles = get_story_details(tracked_story['id'])
+            logger.info(f"Found {len(story_with_articles.get('articles', [])) if story_with_articles else 0} related articles")
+
+            return make_response(jsonify({
+                'status': 'success',
+                'data': story_with_articles
+            }), 201)
+
+        except Exception as e:
+            logger.error(f"Error creating tracked story: {str(e)}")
+            return make_response(jsonify({
+                'status': 'error',
+                'message': str(e)
+            }), 500)
+
+@story_tracking_ns.route('', methods=['OPTIONS'])
+class StoryTrackingOptions(Resource):
+    def options(self):
+        """Handle OPTIONS requests for the story tracking endpoint.
+
+        This function sets the necessary CORS headers for preflight requests
+        to the story tracking endpoint.
+
+        Returns:
+            Response: A Flask response object with appropriate CORS headers.
+        """
+        logger.debug("Story tracking OPTIONS endpoint called")
+        response = make_response()
+        response.headers.add("Access-Control-Allow-Origin", "*")
+        response.headers.add("Access-Control-Allow-Headers", "Content-Type,Authorization")
+        response.headers.add("Access-Control-Allow-Methods", "GET,POST,PUT,DELETE,OPTIONS")
+        logger.debug("Responding with CORS headers")
+        return response
+ """ + try: + logger.debug("Start story tracking endpoint called") + auth_header = request.headers.get('Authorization') + token = auth_header.split()[1] + logger.debug(f"Decoding token: {token[:10]}...") + # Import app from main module to access config + from flask import current_app + payload = jwt.decode(token, current_app.config['SECRET_KEY'], algorithms=['HS256'], audience='authenticated') + user_id = payload.get('sub') + logger.info(f"Starting polling for user: {user_id}") + + data = request.get_json() + story_id = data.get('story_id') + logger.debug(f"Story ID: {story_id}") + + if not story_id: + logger.warning("Story ID missing in request") + return make_response(jsonify({ + 'status': 'error', + 'message': 'Story ID is required' + }), 400) + + logger.debug(f"Calling toggle_polling with user_id: {user_id}, story_id: {story_id}, enable=True") + updated_story = toggle_polling(user_id, story_id, enable=True) + + if not updated_story: + logger.warning(f"No story found with ID {story_id} for user {user_id}") + return make_response(jsonify({ + 'status': 'error', + 'message': 'Story not found or unauthorized' + }), 404) + + logger.info(f"Polling started for story: {story_id}") + return make_response(jsonify({ + 'status': 'success', + 'message': 'Polling started successfully', + 'data': updated_story + }), 200) + + except Exception as e: + logger.error(f"Error starting polling: {str(e)}") + return make_response(jsonify({ + 'status': 'error', + 'message': str(e) + }), 500) + +@story_tracking_ns.route('/stop') +class StopStoryTracking(Resource): + @token_required + def post(self): + """Stop polling for a tracked story. + Requires a valid JWT token in the Authorization header. + Disables polling for a specific tracked story. + Expected JSON payload: + { + 'story_id': str (required) + } + Returns: + dict: Contains updated story details and success status. + int: HTTP 200 on success, 400 on validation error, 404 if story not found, 500 on server error. 
+ """ + try: + logger.debug("Stop story tracking endpoint called") + auth_header = request.headers.get('Authorization') + token = auth_header.split()[1] + logger.debug(f"Decoding token: {token[:10]}...") + # Import app from main module to access config + from flask import current_app + payload = jwt.decode(token, current_app.config['SECRET_KEY'], algorithms=['HS256'], audience='authenticated') + user_id = payload.get('sub') + logger.info(f"Stopping polling for user: {user_id}") + + data = request.get_json() + story_id = data.get('story_id') + logger.debug(f"Story ID: {story_id}") + + if not story_id: + logger.warning("Story ID missing in request") + return make_response(jsonify({ + 'status': 'error', + 'message': 'Story ID is required' + }), 400) + + logger.debug(f"Calling toggle_polling with user_id: {user_id}, story_id: {story_id}, enable=False") + updated_story = toggle_polling(user_id, story_id, enable=False) + + if not updated_story: + logger.warning(f"No story found with ID {story_id} for user {user_id}") + return make_response(jsonify({ + 'status': 'error', + 'message': 'Story not found or unauthorized' + }), 404) + + logger.info(f"Polling stopped for story: {story_id}") + return make_response(jsonify({ + 'status': 'success', + 'message': 'Polling stopped successfully', + 'data': updated_story + }), 200) + + except Exception as e: + logger.error(f"Error stopping polling: {str(e)}") + return make_response(jsonify({ + 'status': 'error', + 'message': str(e) + }), 500) + +@story_tracking_ns.route('/user') +class UserStoryTracking(Resource): + @token_required + def get(self): + """Get all tracked stories for the authenticated user. + + Requires a valid JWT token in the Authorization header. + Retrieves all tracked stories associated with the authenticated user. + + Returns: + dict: Contains list of tracked stories and success status. + int: HTTP 200 on success, 500 on error. + """ + try: + logger.debug("User story tracking endpoint called") + auth_header = request.headers.get('Authorization') + token = auth_header.split()[1] + logger.debug(f"Decoding token: {token[:10]}...") + # Import app from main module to access config + from flask import current_app + payload = jwt.decode(token, current_app.config['SECRET_KEY'], algorithms=['HS256'], audience='authenticated') + user_id = payload.get('sub') + logger.info(f"Getting tracked stories for user: {user_id}") + + logger.debug(f"Calling get_tracked_stories") + tracked_stories = get_tracked_stories(user_id) + logger.info(f"Found {len(tracked_stories)} tracked stories") + + return make_response(jsonify({ + 'status': 'success', + 'data': tracked_stories + }), 200) + + except Exception as e: + logger.error(f"Error getting tracked stories: {str(e)}") + return make_response(jsonify({ + 'status': 'error', + 'message': str(e) + }), 500) + +@story_tracking_ns.route('/') +class StoryTrackingDetail(Resource): + @token_required + def get(self, story_id): + """Get details for a specific tracked story. + + Requires a valid JWT token in the Authorization header. + Retrieves detailed information about a specific tracked story. + + Args: + story_id (str): The ID of the tracked story to retrieve. + + Returns: + dict: Contains story details and success status. + int: HTTP 200 on success, 404 if story not found, 500 on error. 
+ """ + try: + logger.debug(f"Story tracking detail endpoint called for story: {story_id}") + logger.debug(f"Calling get_story_details for story: {story_id}") + story = get_story_details(story_id) + + if not story: + logger.warning(f"No story found with ID: {story_id}") + return make_response(jsonify({ + 'status': 'error', + 'message': 'Tracked story not found' + }), 404) + + logger.info(f"Found story: {story['keyword']}") + logger.debug(f"Story has {len(story.get('articles', []))} articles") + return make_response(jsonify({ + 'status': 'success', + 'data': story + }), 200) + + except Exception as e: + logger.error(f"Error getting story details: {str(e)}") + return make_response(jsonify({ + 'status': 'error', + 'message': str(e) + }), 500) + + @token_required + def delete(self, story_id): + """Stop tracking a story. + + Requires a valid JWT token in the Authorization header. + Deletes a tracked story for the authenticated user. + + Args: + story_id (str): The ID of the tracked story to delete. + + Returns: + dict: Contains success message. + int: HTTP 200 on success, 404 if story not found, 500 on error. + """ + try: + logger.debug(f"Delete story tracking endpoint called for story: {story_id}") + auth_header = request.headers.get('Authorization') + token = auth_header.split()[1] + logger.debug(f"Decoding token: {token[:10]}...") + # Import app from main module to access config + from flask import current_app + payload = jwt.decode(token, current_app.config['SECRET_KEY'], algorithms=['HS256'], audience='authenticated') + user_id = payload.get('sub') + logger.info(f"Deleting tracked story {story_id} for user {user_id}") + + logger.debug(f"Calling delete_tracked_story") + success = delete_tracked_story(user_id, story_id) + logger.debug(f"Delete result: {success}") + + if not success: + logger.warning(f"Failed to delete story or story not found") + return make_response(jsonify({ + 'status': 'error', + 'message': 'Failed to delete tracked story or story not found' + }), 404) + + logger.info(f"Story deleted successfully") + return make_response(jsonify({ + 'status': 'success', + 'message': 'Tracked story deleted successfully' + }), 200) + + except Exception as e: + logger.error(f"Error deleting tracked story: {str(e)}") + return make_response(jsonify({ + 'status': 'error', + 'message': str(e) + }), 500) + + diff --git a/backend/api_gateway/routes/summarize.py b/backend/api_gateway/routes/summarize.py new file mode 100644 index 0000000..a2740ed --- /dev/null +++ b/backend/api_gateway/routes/summarize.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python3 +""" +Summarization API Routes + +This module contains the API routes for text summarization operations. +""" + +# Standard library imports +from flask import request +from flask_restx import Resource, Namespace, fields + +# Import microservices and utilities +from backend.microservices.summarization_service import run_summarization +from backend.core.utils import setup_logger + +# Initialize logger +logger = setup_logger(__name__) + +# Create summarize namespace +summarize_ns = Namespace('summarize', description='Text summarization operations') + +# Define API models for request/response documentation +article_model = summarize_ns.model('Article', { + 'article_text': fields.String(required=True, description='The text to summarize') +}) + +@summarize_ns.route('/') +class Summarize(Resource): + @summarize_ns.expect(article_model) + def post(self): + """Summarize the provided article text. + + Expects a JSON payload with 'article_text' field. 
+ Uses the summarization service to generate a concise summary. + + Returns: + dict: Contains the generated summary. + int: HTTP 200 status code on success. + """ + logger.info("Summarize endpoint called") + data = request.get_json() + article_text = data.get('article_text', '') + logger.debug(f"Summarizing text of length: {len(article_text)}") + summary = run_summarization(article_text) + logger.debug(f"Summarization complete, summary length: {len(summary)}") + return {"summary": summary}, 200 \ No newline at end of file diff --git a/backend/api_gateway/routes/user.py b/backend/api_gateway/routes/user.py new file mode 100644 index 0000000..712d832 --- /dev/null +++ b/backend/api_gateway/routes/user.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python3 +""" +User API Routes + +This module contains the API routes for user operations including profile management. +""" + +# Standard library imports +from flask import jsonify, request, make_response +from flask_restx import Resource, Namespace, fields +import jwt + +# Import microservices and utilities +from backend.microservices.auth_service import load_users +from functools import wraps +from flask import current_app +from backend.core.utils import setup_logger + +# Initialize logger +logger = setup_logger(__name__) + +# Create user namespace +user_ns = Namespace('api/user', description='User operations') + +# Define API models for request/response documentation +user_profile_model = user_ns.model('UserProfile', { + 'id': fields.String(description='User ID'), + 'username': fields.String(description='Username'), + 'email': fields.String(description='Email address'), + 'firstName': fields.String(description='First name'), + 'lastName': fields.String(description='Last name'), + 'avatarUrl': fields.String(description='URL to user avatar') +}) + +# Import token_required decorator from utils +from backend.api_gateway.utils.auth import token_required + +@user_ns.route('/profile') +class UserProfile(Resource): + @token_required + @user_ns.marshal_with(user_profile_model) + def get(self): + """Retrieve authenticated user's profile information. + + Requires a valid JWT token in the Authorization header. + Returns the user's profile data excluding sensitive information. + + Returns: + dict: User profile data including id, username, email, and names. + int: HTTP 200 on success, 404 if user not found. 
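+
+        Example request (illustrative token):
+            GET /api/user/profile
+            Authorization: Bearer eyJhbGciOi...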
+ """ + logger.info("User profile endpoint called") + auth_header = request.headers.get('Authorization') + token = auth_header.split()[1] + logger.debug(f"Decoding token: {token[:10]}...") + payload = jwt.decode(token, current_app.config['SECRET_KEY'], algorithms=['HS256'], audience='authenticated') + logger.debug(f"Looking up user with ID: {payload.get('sub')}") + + users = load_users() + user = next((u for u in users if u.get('id') == payload.get('sub')), None) + if not user: + logger.warning(f"User not found with ID: {payload.get('sub')}") + return {'error': 'User not found'}, 404 + + logger.debug(f"Found user: {user.get('username')}") + return {k: user[k] for k in user if k != 'password'}, 200 \ No newline at end of file diff --git a/backend/api_gateway/utils/__init__.py b/backend/api_gateway/utils/__init__.py new file mode 100644 index 0000000..ce805ff --- /dev/null +++ b/backend/api_gateway/utils/__init__.py @@ -0,0 +1 @@ +# This file makes the utils directory a Python package \ No newline at end of file diff --git a/backend/api_gateway/utils/auth.py b/backend/api_gateway/utils/auth.py new file mode 100644 index 0000000..2516d21 --- /dev/null +++ b/backend/api_gateway/utils/auth.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python3 +""" +Authentication Utilities + +This module provides authentication utilities for the News Aggregator API Gateway, +including the token_required decorator for protecting routes that require authentication. +""" + +# Standard library imports +from flask import request +from functools import wraps +import jwt + +# Import Flask app for accessing config +from flask import current_app + +def token_required(f): + """Decorator to protect routes that require authentication. + + This decorator validates the JWT token in the Authorization header. + It ensures that only authenticated users can access protected endpoints. + + Args: + f: The function to be decorated. + + Returns: + decorated: The decorated function that includes token validation. + + Raises: + 401: If the token is missing or invalid. 
+ """ + @wraps(f) + def decorated(*args, **kwargs): + print("[DEBUG] [api_gateway] [token_required] Checking token in request") + auth_header = request.headers.get('Authorization') + if not auth_header: + print("[DEBUG] [api_gateway] [token_required] Authorization header missing") + return {'error': 'Authorization header missing'}, 401 + try: + token = auth_header.split()[1] # Extract token from 'Bearer ' + print(f"[DEBUG] [api_gateway] [token_required] Decoding token: {token[:10]}...") + payload = jwt.decode(token, current_app.config['SECRET_KEY'], algorithms=['HS256'], audience='authenticated') + print(f"[DEBUG] [api_gateway] [token_required] Token decoded successfully, user: {payload.get('sub', 'unknown')}") + + return f(*args, **kwargs) + except Exception as e: + print(f"[DEBUG] [api_gateway] [token_required] Token validation error: {str(e)}") + return {'error': 'Invalid token', 'message': str(e)}, 401 + return decorated \ No newline at end of file diff --git a/backend/data/story_tracking_schema.sql b/backend/data/story_tracking_schema.sql index 60e3636..b464237 100644 --- a/backend/data/story_tracking_schema.sql +++ b/backend/data/story_tracking_schema.sql @@ -7,7 +7,9 @@ CREATE TABLE tracked_stories ( user_id UUID NOT NULL REFERENCES auth.users(id), keyword VARCHAR(255) NOT NULL, created_at TIMESTAMP NOT NULL DEFAULT NOW(), - last_updated TIMESTAMP NOT NULL DEFAULT NOW() + last_updated TIMESTAMP NOT NULL DEFAULT NOW(), + is_polling BOOLEAN NOT NULL DEFAULT FALSE, + last_polled_at TIMESTAMP ); -- Table for articles related to tracked stories @@ -21,6 +23,7 @@ CREATE TABLE tracked_story_articles ( -- Index for faster lookups CREATE INDEX idx_tracked_stories_user_id ON tracked_stories(user_id); CREATE INDEX idx_tracked_stories_keyword ON tracked_stories(keyword); +CREATE INDEX idx_tracked_stories_polling ON tracked_stories(is_polling); CREATE INDEX idx_tracked_story_articles_story_id ON tracked_story_articles(tracked_story_id); -- RLS Policies for tracked_stories diff --git a/backend/microservices/.DS_Store b/backend/microservices/.DS_Store index 677c187..cf5b4c5 100644 Binary files a/backend/microservices/.DS_Store and b/backend/microservices/.DS_Store differ diff --git a/backend/microservices/data_services/__init__.py b/backend/microservices/data_services/__init__.py new file mode 100644 index 0000000..8b89bf6 --- /dev/null +++ b/backend/microservices/data_services/__init__.py @@ -0,0 +1,8 @@ +# data_services package + +""" +Data Services Package + +This package contains modules for fetching and processing data from external sources. +It provides services for retrieving news articles and other content from APIs. +""" \ No newline at end of file diff --git a/backend/microservices/data_services/news_fetcher.py b/backend/microservices/data_services/news_fetcher.py new file mode 100644 index 0000000..bd074de --- /dev/null +++ b/backend/microservices/data_services/news_fetcher.py @@ -0,0 +1,115 @@ +"""News Fetcher Service + +This module is responsible for fetching news articles from the News API based on +keywords and managing the storage of fetched articles. It provides functionality +to search for news articles and optionally save them to files with session-based +organization. + +The module uses the News API (https://newsapi.org/) as its primary data source +and supports session-based article management for multi-user scenarios. 
+
+Typical usage:
+    articles = fetch_news('technology')
+    write_to_file(articles, 'user_session_123')
+
+Environment Variables Required:
+    NEWS_API_KEY: API key for accessing the News API service
+"""
+
+import os
+import requests
+from dotenv import load_dotenv
+import json
+from pathlib import Path
+from backend.core.config import Config
+
+# Load environment variables from .env file for configuration
+load_dotenv()
+
+# Initialize the News API key from environment variables
+NEWS_API_KEY = os.getenv('NEWS_API_KEY')
+
+def fetch_news(keyword='', session_id=None):
+    """Fetch news articles from News API based on a keyword search.
+
+    This function queries the News API to retrieve articles matching the provided
+    keyword. It supports session-based tracking of requests and can handle empty
+    keyword searches.
+
+    Args:
+        keyword (str, optional): The search term to find relevant articles.
+            Defaults to empty string which returns top headlines.
+        session_id (str, optional): Unique identifier for the current user session.
+            Used for organizing saved articles. Defaults to None.
+
+    Returns:
+        list: A list of dictionaries containing article data with fields like
+            'title', 'description', 'url', etc. Returns None on error.
+
+    Raises:
+        requests.exceptions.RequestException: If there's an error communicating
+            with the News API.
+    """
+    # Configure the News API endpoint and request parameters
+    url = "https://newsapi.org/v2/everything"
+    params = {
+        'q': keyword,  # Search query parameter
+        'apiKey': NEWS_API_KEY,
+        'pageSize': 1  # Limit results to 1 article per request
+    }
+
+    try:
+        # Make a GET request to the News API
+        response = requests.get(url, params=params)
+        response.raise_for_status()
+
+        # Process the response data
+        news_data = response.json()
+        if news_data.get('status') == 'ok':
+            articles = news_data.get('articles', [])
+            if not articles:
+                print("No articles found for the given keyword.")
+
+            return articles
+        else:
+            print("Failed to fetch news:", news_data.get('message'))
+
+    except requests.exceptions.RequestException as e:
+        print(f"Error fetching news: {e}")
+
+def write_to_file(articles, session_id=None):
+    """Save fetched news articles to a JSON file.
+
+    This function stores the provided articles in a JSON file, organizing them
+    by session ID. It creates the necessary directories if they don't exist.
+
+    Args:
+        articles (list): List of article dictionaries to save.
+        session_id (str, optional): Unique identifier for the current session.
+            Used to create a unique filename. Defaults to 'default' if None.
+
+    Returns:
+        None
+
+    Raises:
+        IOError: If there's an error writing to the file system.
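+
+    Example (illustrative session id):
+        write_to_file(articles, session_id='user_session_123')
+        # Saves to Config.NEWS_DATA_DIR / 'user_session_123_news_data.json'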
+ """ + # Use default session ID if none provided + if not session_id: + session_id = 'default' + + # Generate a unique filename using the session ID + file_name = f'{session_id}_news_data.json' + + # Construct the full file path using the configured data directory + file_path = Config.NEWS_DATA_DIR / file_name + try: + # Save the articles as formatted JSON for better readability + with open(file_path, 'w') as file: + json.dump(articles, file, indent=4) + print(f"Articles successfully saved to {file_path}") + except IOError as e: + print(f"Error writing to file: {e}") + +if __name__ == '__main__': + fetch_news() \ No newline at end of file diff --git a/backend/microservices/ingestion_service.py b/backend/microservices/ingestion_service.py index 91c665b..8bfa951 100755 --- a/backend/microservices/ingestion_service.py +++ b/backend/microservices/ingestion_service.py @@ -1,45 +1,45 @@ -#!/usr/bin/env python3 -""" -ingestion_service.py - Microservice for Data Ingestion -Handles incoming data from scrapers, APIs, and RSS feeds, -and passes them to the processing layer. -""" +# #!/usr/bin/env python3 +# """ +# ingestion_service.py - Microservice for Data Ingestion +# Handles incoming data from scrapers, APIs, and RSS feeds, +# and passes them to the processing layer. +# """ -from flask import Flask, jsonify, request +# from flask import Flask, jsonify, request -app = Flask(__name__) +# app = Flask(__name__) -# Simple in-memory storage for demo -articles = {} -current_id = 1 +# # Simple in-memory storage for demo +# articles = {} +# current_id = 1 -@app.route('/api/news', methods=['GET', 'POST']) -def news(): - global current_id - if request.method == 'POST': - data = request.get_json() - articles[current_id] = data - response = {'id': current_id, 'message': 'Article created'} - current_id += 1 - return jsonify(response), 201 - else: - return jsonify(list(articles.values())), 200 +# @app.route('/api/news', methods=['GET', 'POST']) +# def news(): +# global current_id +# if request.method == 'POST': +# data = request.get_json() +# articles[current_id] = data +# response = {'id': current_id, 'message': 'Article created'} +# current_id += 1 +# return jsonify(response), 201 +# else: +# return jsonify(list(articles.values())), 200 -@app.route('/api/news/', methods=['GET']) -def news_by_id(article_id): - article = articles.get(article_id) - if article: - return jsonify(article), 200 - return jsonify({'error': 'Article not found'}), 404 +# @app.route('/api/news/', methods=['GET']) +# def news_by_id(article_id): +# article = articles.get(article_id) +# if article: +# return jsonify(article), 200 +# return jsonify({'error': 'Article not found'}), 404 -@app.route('/api/news/search', methods=['GET']) -def search(): - query = request.args.get('q', '').lower() - results = [ - article for article in articles.values() - if query in article.get('title', '').lower() or query in article.get('content', '').lower() - ] - return jsonify(results), 200 +# @app.route('/api/news/search', methods=['GET']) +# def search(): +# query = request.args.get('q', '').lower() +# results = [ +# article for article in articles.values() +# if query in article.get('title', '').lower() or query in article.get('content', '').lower() +# ] +# return jsonify(results), 200 -if __name__ == '__main__': - app.run(host='0.0.0.0', port=5002) +# if __name__ == '__main__': +# app.run(host='0.0.0.0', port=5002) diff --git a/backend/microservices/news_fetcher.py b/backend/microservices/news_fetcher.py index b5ab9d9..5db0ebc 100644 --- 
a/backend/microservices/news_fetcher.py +++ b/backend/microservices/news_fetcher.py @@ -1,129 +1,16 @@ -"""News Fetcher Service +"""News Fetcher Service (Compatibility Module) -This module is responsible for fetching news articles from the News API based on -keywords and managing the storage of fetched articles. It provides functionality -to search for news articles and optionally save them to files with session-based -organization. +This is a compatibility module that imports from the new location in data_services. +Existing code that imports from this location will continue to work. -The module uses the News API (https://newsapi.org/) as its primary data source -and supports session-based article management for multi-user scenarios. - -Typical usage: - articles = fetch_news('technology') - write_to_file(articles, 'user_session_123') - -Environment Variables Required: - NEWS_API_KEY: API key for accessing the News API service +For new code, please import directly from backend.microservices.data_services.news_fetcher """ -import os -import requests -from dotenv import load_dotenv -import json -from pathlib import Path -from backend.core.config import Config - -# Load environment variables from .env file for configuration -load_dotenv() - -# Initialize the News API key from environment variables -NEWS_API_KEY = os.getenv('NEWS_API_KEY') - -def fetch_news(keyword='', session_id=None): - """Fetch news articles from News API based on a keyword search. - - This function queries the News API to retrieve articles matching the provided - keyword. It supports session-based tracking of requests and can handle empty - keyword searches. - - Args: - keyword (str, optional): The search term to find relevant articles. - Defaults to empty string which returns top headlines. - session_id (str, optional): Unique identifier for the current user session. - Used for organizing saved articles. Defaults to None. - - Returns: - list: A list of dictionaries containing article data with fields like - 'title', 'description', 'url', etc. Returns None on error. - - Raises: - requests.exceptions.RequestException: If there's an error communicating - with the News API. - """ - # Configure the News API endpoint and request parameters - url = "https://newsapi.org/v2/everything" - params = { - 'q': keyword, # Search query parameter - 'apiKey': NEWS_API_KEY, - 'pageSize': 1 # Limit results to 10 articles per request - } - - try: - # Make a GET request to the News API - response = requests.get(url, params=params) - response.raise_for_status() - - # Process the response data - news_data = response.json() - if news_data.get('status') == 'ok': - articles = news_data.get('articles', []) - if not articles: - print("No articles found for the given keyword.") - else: - pass - # Use session_id in the filename if provided - # if session_id: - # write_to_file(articles, session_id) - # else: - # write_to_file(articles) - # for article in articles: - # print(f"Title: {article['title']}") - # print(f"Description: {article['description']}") - # print(f"URL: {article['url']}\n") - - return articles - else: - print("Failed to fetch news:", news_data.get('message')) - - except requests.exceptions.RequestException as e: - print(f"Error fetching news: {e}") - -def write_to_file(articles, session_id=None): - """Save fetched news articles to a JSON file. - - This function stores the provided articles in a JSON file, organizing them - by session ID. It creates the necessary directories if they don't exist. 
- - Args: - articles (list): List of article dictionaries to save. - session_id (str, optional): Unique identifier for the current session. - Used to create a unique filename. Defaults to 'default' if None. - - Returns: - None - - Raises: - IOError: If there's an error writing to the file system. - """ - # Use default session ID if none provided - if not session_id: - session_id = 'default' - - # Generate a unique filename using the session ID - file_name = f'{session_id}_news_data.json' - - # Construct the full file path using the configured data directory - file_path = Config.NEWS_DATA_DIR / file_name - try: - # Save the articles as formatted JSON for better readability - with open(file_path, 'w') as file: - json.dump(articles, file, indent=4) - print(f"Articles successfully saved to {file_path}") - except IOError as e: - print(f"Error writing to file: {e}") +# Import all functions from the new location to maintain backward compatibility +from backend.microservices.data_services.news_fetcher import fetch_news, write_to_file -if __name__ == '__main__': - fetch_news() +# Re-export the functions to maintain the same interface +__all__ = ['fetch_news', 'write_to_file'] diff --git a/backend/microservices/news_storage.py b/backend/microservices/news_storage.py index c9eedf4..cad067d 100644 --- a/backend/microservices/news_storage.py +++ b/backend/microservices/news_storage.py @@ -3,13 +3,15 @@ News Storage Service - Supabase Database Integration Module This module provides functions for storing and retrieving news articles and user interactions -with the Supabase database. It handles article storage, user search history logging, and bookmark -management operations. +with the Supabase database. It handles article storage and imports user search history logging +and bookmark management operations from dedicated modules. The module uses the Supabase client to interact with the following tables: - news_articles: Stores article content and metadata -- user_search_history: Tracks user search interactions -- user_bookmarks: Manages user article bookmarks + +Other functionality has been moved to dedicated modules: +- User search history: storage/search_logger.py +- Bookmark management: storage/bookmark_service.py Environment Variables Required: - VITE_SUPABASE_URL: Supabase project URL @@ -18,9 +20,22 @@ import os import datetime +import logging from supabase import create_client, Client from dotenv import load_dotenv +# Import functions from storage modules +from backend.microservices.storage.search_logger import log_user_search +from backend.microservices.storage.bookmark_service import ( + add_bookmark, + get_user_bookmarks, + delete_bookmark +) + +# Initialize logger +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + # Load environment variables from .env file load_dotenv('../../.env') @@ -29,6 +44,8 @@ SUPABASE_SERVICE_KEY = os.getenv("VITE_SUPABASE_ANON_KEY") # Using anon key for server-side operations supabase: Client = create_client(SUPABASE_URL, SUPABASE_SERVICE_KEY) +logger.info("News Storage Service initialized with Supabase configuration") + def store_article_in_supabase(article): """ Inserts a news article into the Supabase news_articles table if it doesn't already exist. 
@@ -51,158 +68,33 @@ def store_article_in_supabase(article): Returns: str: The ID of the article (either existing or newly created) """ - # Check if the article already exists using the URL as unique identifier - existing = supabase.table("news_articles").select("*").eq("url", article["url"]).execute() - if existing.data and len(existing.data) > 0: - # Article already exists; return its id - return existing.data[0]["id"] - else: - # Insert a new article with all available fields - result = supabase.table("news_articles").insert({ - "title": article["title"], - "summary": article.get("summary", ""), - "content": article.get("content", ""), - # Handle source field which can be a dict (from API) or a plain string - "source": article["source"]["name"] if isinstance(article.get("source"), dict) else article["source"], - "published_at": article["publishedAt"], - "url": article["url"], - "image": article.get("urlToImage", "") - }).execute() - return result.data[0]["id"] - -def log_user_search(user_id, news_id, session_id): - """ - Logs a search event by inserting a record into the user_search_history join table. - - This function creates a record of a user viewing or searching for a specific article, - which can be used for analytics, personalization, and tracking user activity across sessions. - - Args: - user_id (str): The ID of the user performing the search - news_id (str): The ID of the news article that was viewed/searched - session_id (str): The current session identifier for tracking user activity - - Returns: - dict: The Supabase response object containing the result of the insert operation - """ - # Create a timestamp for when the search occurred - current_time = datetime.datetime.utcnow().isoformat() - - # Insert the search record with all required fields - result = supabase.table("user_search_history").insert({ - "user_id": user_id, - "news_id": news_id, - "searched_at": current_time, - "session_id": session_id, - }).execute() - return result - -def add_bookmark(user_id, news_id): - """ - Adds a bookmark by inserting a record into the user_bookmarks table. - - This function creates a bookmark relationship between a user and a news article, - allowing users to save articles for later reading. - - Args: - user_id (str): The ID of the user adding the bookmark - news_id (str): The ID of the news article to bookmark - - Returns: - dict or None: The created bookmark record if successful, None otherwise - - Raises: - Exception: If there's an error during the database operation - """ - try: - # Insert a new bookmark record linking user to article - result = supabase.table("user_bookmarks").insert({ - "user_id": user_id, - "news_id": news_id, - }).execute() - - # Return the first data item if available, otherwise None - return result.data[0] if result.data else None - except Exception as e: - print(f"Error adding bookmark: {str(e)}") - # Re-raise the exception for proper error handling upstream - raise e - -def get_user_bookmarks(user_id): - """ - Retrieves all bookmarked articles for a user with full article details. - - This function performs a join between the user_bookmarks table and the news_articles table - to retrieve complete article information for all articles bookmarked by the specified user. - The results are transformed into a more user-friendly format where each article includes its - bookmark_id for reference. 
+ logger.debug(f"Attempting to store article: {article.get('title')} from {article.get('url')}") - Args: - user_id (str): The ID of the user whose bookmarks should be retrieved - - Returns: - list: A list of dictionaries, each containing the full details of a bookmarked article - with an additional 'bookmark_id' field - - Raises: - Exception: If there's an error during the database operation - """ + # Check if the article already exists using the URL as unique identifier try: - # Query user_bookmarks and join with news_articles to get full article details - # This uses Supabase's foreign key relationships to perform the join - result = supabase.table("user_bookmarks") \ - .select( - "id," - "news_articles(id,title,summary,content,source,published_at,url,image)" - ) \ - .eq("user_id", user_id) \ - .execute() - - # Transform the nested result structure to a more friendly format - # by flattening the news_articles data and adding the bookmark_id - bookmarks = [] - for item in result.data: - article = item["news_articles"] - article["bookmark_id"] = item["id"] # Add bookmark ID to article for reference - bookmarks.append(article) - - return bookmarks + existing = supabase.table("news_articles").select("*").eq("url", article["url"]).execute() + if existing.data and len(existing.data) > 0: + # Article already exists; return its id + logger.info(f"Article already exists with ID: {existing.data[0]['id']}") + return existing.data[0]["id"] + else: + # Insert a new article with all available fields + logger.debug("Article not found in database, proceeding with insertion") + result = supabase.table("news_articles").insert({ + "title": article["title"], + "summary": article.get("summary", ""), + "content": article.get("content", ""), + # Handle source field which can be a dict (from API) or a plain string + "source": article["source"]["name"] if isinstance(article.get("source"), dict) else article["source"], + "published_at": article["publishedAt"], + "url": article["url"], + "image": article.get("urlToImage", "") + }).execute() + logger.info(f"Successfully stored new article with ID: {result.data[0]['id']}") + return result.data[0]["id"] except Exception as e: - print(f"Error fetching bookmarks: {str(e)}") - # Re-raise the exception for proper error handling upstream - raise e + logger.error(f"Error storing article in Supabase: {str(e)}") + raise -def delete_bookmark(user_id, bookmark_id): - """ - Deletes a bookmark from the user_bookmarks table. - - This function removes a bookmark relationship between a user and an article. - It ensures that users can only delete their own bookmarks by checking both the - bookmark_id and user_id in the query. 
- - Args: - user_id (str): The ID of the user who owns the bookmark - bookmark_id (str): The ID of the bookmark to delete - - Returns: - bool: True if the bookmark was successfully deleted, False if no bookmark was found - or if the deletion was unsuccessful - - Raises: - Exception: If there's an error during the database operation - """ - try: - # Delete the bookmark, ensuring it belongs to the specified user - # This double condition prevents users from deleting other users' bookmarks - result = supabase.table("user_bookmarks") \ - .delete() \ - .eq("id", bookmark_id) \ - .eq("user_id", user_id) \ - .execute() - - # Return True if at least one record was deleted, False otherwise - return len(result.data) > 0 - except Exception as e: - print(f"Error deleting bookmark: {str(e)}") - # Re-raise the exception for proper error handling upstream - raise e \ No newline at end of file +# The functions log_user_search, add_bookmark, get_user_bookmarks, and delete_bookmark +# have been moved to dedicated modules in the storage directory and are now imported above \ No newline at end of file diff --git a/backend/microservices/nope.env b/backend/microservices/nope.env deleted file mode 100644 index 9e0a1db..0000000 --- a/backend/microservices/nope.env +++ /dev/null @@ -1,17 +0,0 @@ -# API Configuration -API_HOST=localhost -API_PORT=5001 - -# CORS Configurationasd -CORS_ORIGINS=http://localhost:5173,http://localhost:3000,http://localhost:5001 - -# Redis Configuration -REDIS_HOST=localhost -REDIS_PORT=6379 - -# API Keys -NEWS_API_KEY=4b94554081e148bc964e4ab94c9dc0fe -OPENAI_API_KEY=your_openai_api_key_here - -# Logging -LOG_LEVEL=INFO \ No newline at end of file diff --git a/backend/microservices/polling_worker.py b/backend/microservices/polling_worker.py new file mode 100644 index 0000000..5af3476 --- /dev/null +++ b/backend/microservices/polling_worker.py @@ -0,0 +1,390 @@ +#!/usr/bin/env python3 +""" +polling_worker.py - Worker for automatic news article polling + +This script runs as a background process or scheduled task that periodically: +1. Queries Supabase for tracked stories with polling enabled +2. Fetches new articles for each story's keyword +3. Stores new articles and links them to the tracked story +4. 
Updates the last_polled_at timestamp + +Usage: +- Run directly: python polling_worker.py +- Schedule with cron or a process manager + +Environment Variables Required: +- VITE_SUPABASE_URL: Supabase project URL +- SUPABASE_SERVICE_ROLE_KEY: Service role key for admin access +- NEWS_API_KEY: API key for the news service +- POLLING_INTERVAL: Time in minutes between polling cycles (default: 5) +""" + +import os +import time +import datetime +import schedule +import logging +import requests +from supabase import create_client, Client +from dotenv import load_dotenv + +# Set up logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s [%(levelname)s] [polling_worker] %(message)s', + handlers=[ + logging.StreamHandler() + ] +) +logger = logging.getLogger(__name__) + +# Load environment variables from .env file +load_dotenv() +logger.info("Environment variables loaded") + +# Initialize Supabase client with service role key for admin access +SUPABASE_URL = os.getenv("VITE_SUPABASE_URL") +SUPABASE_SERVICE_KEY = os.getenv("SUPABASE_SERVICE_ROLE_KEY") +NEWS_API_KEY = os.getenv("NEWS_API_KEY") +POLLING_INTERVAL = int(os.getenv("POLLING_INTERVAL", "5")) # Default to 5 minutes if not specified + +logger.info(f"Supabase URL: {SUPABASE_URL}") +logger.info(f"Supabase Key: {SUPABASE_SERVICE_KEY[:5]}..." if SUPABASE_SERVICE_KEY else "Supabase Key: None") +logger.info(f"News API Key: {NEWS_API_KEY[:5]}..." if NEWS_API_KEY else "News API Key: None") +logger.info(f"Polling interval: {POLLING_INTERVAL} minutes") + +# Create Supabase client for database operations +supabase: Client = create_client(SUPABASE_URL, SUPABASE_SERVICE_KEY) +logger.info("Supabase client initialized") + +def get_active_polling_stories(): + """ + Fetches all stories that have polling enabled + + Returns: + list: Stories with polling enabled, each containing id, user_id, keyword, and last_polled_at + """ + try: + logger.info("Fetching active polling stories") + result = supabase.table("tracked_stories") \ + .select("id, user_id, keyword, last_polled_at") \ + .eq("is_polling", True) \ + .execute() + + stories = result.data if result.data else [] + logger.info(f"Found {len(stories)} stories with polling enabled") + return stories + except Exception as e: + logger.error(f"Error fetching polling stories: {str(e)}") + return [] + +def fetch_news_articles(keyword, since_date=None): + """ + Fetches news articles from the News API based on a keyword + + Args: + keyword (str): The search term to find relevant articles + since_date (str, optional): ISO format date to fetch articles published since then + + Returns: + list: A list of article dictionaries + """ + try: + logger.info(f"Fetching news articles for keyword: '{keyword}'") + + # Configure the News API endpoint and request parameters + url = "https://newsapi.org/v2/everything" + params = { + 'q': keyword, + 'apiKey': NEWS_API_KEY, + 'pageSize': 1, # Limit results to avoid rate limiting + 'language': 'en', # English articles only + 'sortBy': 'publishedAt' # Get newest articles first + } + + # If we have a since_date, add it to the parameters + if since_date: + # Format date for News API (YYYY-MM-DD) + if isinstance(since_date, str): + try: + dt = datetime.datetime.fromisoformat(since_date.replace('Z', '+00:00')) + from_date = dt.strftime('%Y-%m-%d') + params['from'] = from_date + except ValueError: + logger.warning(f"Invalid date format: {since_date}, skipping date filter") + + logger.info(f"Requesting articles with params: {params}") + response = requests.get(url, params=params) 
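+        # A non-2xx status raises RequestException, handled by the except block below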
+        response.raise_for_status()
+
+        news_data = response.json()
+        if news_data.get('status') == 'ok':
+            articles = news_data.get('articles', [])
+            logger.info(f"Received {len(articles)} articles from News API")
+            return articles
+        else:
+            logger.error(f"News API error: {news_data.get('message')}")
+            return []
+
+    except requests.exceptions.RequestException as e:
+        logger.error(f"Error fetching news: {str(e)}")
+        return []
+
+def store_article(article):
+    """
+    Stores an article in the news_articles table if it doesn't exist
+
+    Args:
+        article (dict): Article data from News API
+
+    Returns:
+        str: ID of the article in the database, or None if storage failed
+    """
+    try:
+        # Check if article with same URL already exists
+        url = article.get('url')
+        if not url:
+            logger.warning("Article missing URL, skipping")
+            return None
+
+        logger.info(f"Checking if article exists: {url}")
+        result = supabase.table("news_articles") \
+            .select("id") \
+            .eq("url", url) \
+            .execute()
+
+        if result.data and len(result.data) > 0:
+            logger.info(f"Article already exists with ID: {result.data[0]['id']}")
+            return result.data[0]['id']
+
+        # Prepare article data
+        source = article.get('source', {}).get('name', 'Unknown Source')
+        publish_date = article.get('publishedAt', datetime.datetime.utcnow().isoformat())
+
+        # Column names must match the news_articles schema used elsewhere
+        # (published_at and image, as in news_storage.store_article_in_supabase)
+        new_article = {
+            "title": article.get('title', 'No Title'),
+            "content": article.get('content', article.get('description', 'No Content')),
+            "summary": article.get('description', 'No Summary'),
+            "source": source,
+            "url": url,
+            "image": article.get('urlToImage', ''),
+            "author": article.get('author', 'Unknown'),
+            "published_at": publish_date
+        }
+
+        # Insert article into news_articles table
+        logger.info(f"Storing new article: {new_article['title'][:30]}...")
+        result = supabase.table("news_articles").insert(new_article).execute()
+
+        if result.data and len(result.data) > 0:
+            article_id = result.data[0]['id']
+            logger.info(f"Article stored with ID: {article_id}")
+            return article_id
+        else:
+            logger.error("Failed to store article")
+            return None
+
+    except Exception as e:
+        logger.error(f"Error storing article: {str(e)}")
+        return None
+
+def link_article_to_story(story_id, article_id):
+    """
+    Links an article to a tracked story in the tracked_story_articles table
+
+    Args:
+        story_id (str): ID of the tracked story
+        article_id (str): ID of the article
+
+    Returns:
+        bool: True if linking was successful, False otherwise
+    """
+    try:
+        # Check if link already exists
+        result = supabase.table("tracked_story_articles") \
+            .select("*") \
+            .eq("tracked_story_id", story_id) \
+            .eq("news_id", article_id) \
+            .execute()
+
+        if result.data and len(result.data) > 0:
+            logger.info(f"Article {article_id} already linked to story {story_id}")
+            return True
+
+        # Create new link
+        logger.info(f"Linking article {article_id} to story {story_id}")
+        result = supabase.table("tracked_story_articles").insert({
+            "tracked_story_id": story_id,
+            "news_id": article_id,
+            "added_at": datetime.datetime.utcnow().isoformat()
+        }).execute()
+
+        if result.data and len(result.data) > 0:
+            logger.info("Article linked successfully")
+            return True
+        else:
+            logger.error("Failed to link article to story")
+            return False
+
+    except Exception as e:
+        logger.error(f"Error linking article to story: {str(e)}")
+        return False
+
+def update_story_timestamps(story_id, has_new_articles=False):
+    """
+    Updates the last_polled_at timestamp for a story and last_updated if new articles were found
+
+    Args:
+        story_id (str): ID of the tracked story
+        has_new_articles (bool): Whether new articles were found
+
+    Returns:
+        bool: True if update was successful, False otherwise
+    """
+    try:
+        current_time = datetime.datetime.utcnow().isoformat()
+        update_data = {
+            "last_polled_at": current_time
+        }
+
+        # Only update last_updated if new articles were found
+        if has_new_articles:
+            update_data["last_updated"] = current_time
+
+        logger.info(f"Updating timestamps for story {story_id}")
+        result = supabase.table("tracked_stories") \
+            .update(update_data) \
+            .eq("id", story_id) \
+            .execute()
+
+        if result.data and len(result.data) > 0:
+            logger.info("Timestamps updated successfully")
+            return True
+        else:
+            logger.error("Failed to update timestamps")
+            return False
+
+    except Exception as e:
+        logger.error(f"Error updating timestamps: {str(e)}")
+        return False
+
+def poll_story(story):
+    """
+    Polls for new articles for a specific story
+
+    Args:
+        story (dict): Story object with id, keyword and last_polled_at
+
+    Returns:
+        int: Number of new articles found
+    """
+    try:
+        story_id = story["id"]
+        keyword = story["keyword"]
+        last_polled_at = story.get("last_polled_at")
+
+        logger.info(f"Polling story {story_id} with keyword: '{keyword}'")
+
+        # Fetch articles from News API
+        articles = fetch_news_articles(keyword, last_polled_at)
+
+        if not articles:
+            logger.info(f"No new articles found for keyword: '{keyword}'")
+            update_story_timestamps(story_id, False)
+            return 0
+
+        # Process each article and store it
+        new_articles_count = 0
+
+        for article in articles:
+            # Store article in news_articles table
+            article_id = store_article(article)
+
+            if article_id:
+                # Link the article to the tracked story
+                success = link_article_to_story(story_id, article_id)
+                if success:
+                    new_articles_count += 1
+
+        # Update the story timestamps
+        update_story_timestamps(story_id, new_articles_count > 0)
+
+        logger.info(f"Poll complete for story {story_id}. Found {new_articles_count} new articles")
+        return new_articles_count
+
+    except Exception as e:
+        logger.error(f"Error polling story {story.get('id', 'unknown')}: {str(e)}")
+        # Still try to update last_polled_at even if there was an error
+        try:
+            update_story_timestamps(story.get('id'), False)
+        except Exception:
+            pass
+        return 0
+
+def run_polling_cycle():
+    """
+    Main function to run a complete polling cycle for all active stories
+    """
+    logger.info("Starting polling cycle")
+    start_time = time.time()
+
+    try:
+        stories = get_active_polling_stories()
+        if not stories:
+            logger.info("No active polling stories found. Polling cycle complete.")
+            return
+
+        total_new_articles = 0
+        stories_updated = 0
+
+        for story in stories:
+            try:
+                # Skip stories polled very recently (within last minute) to avoid redundant polls
+                if story.get("last_polled_at"):
+                    last_polled = datetime.datetime.fromisoformat(story["last_polled_at"].replace('Z', '+00:00'))
+                    now = datetime.datetime.utcnow()
+                    time_since_last_poll = (now - last_polled).total_seconds() / 60  # in minutes
+
+                    if time_since_last_poll < 1:  # Less than 1 minute
+                        logger.info(f"Skipping story {story['id']} - polled recently ({time_since_last_poll:.1f} minutes ago)")
+                        continue
+
+                new_articles = poll_story(story)
+                if new_articles > 0:
+                    total_new_articles += new_articles
+                    stories_updated += 1
+            except Exception as e:
+                logger.error(f"Error processing story {story.get('id', 'unknown')}: {str(e)}")
+                # Continue with next story
+
+        elapsed_time = time.time() - start_time
+        logger.info(f"Polling cycle complete. Updated {stories_updated} stories with {total_new_articles} new articles in {elapsed_time:.2f} seconds")
+
+    except Exception as e:
+        logger.error(f"Error in polling cycle: {str(e)}")
+
+def start_scheduled_polling():
+    """
+    Starts the scheduler to run polling at regular intervals
+    """
+    logger.info(f"Setting up scheduled polling every {POLLING_INTERVAL} minutes")
+
+    # Run immediately when started
+    run_polling_cycle()
+
+    # Schedule regular polling
+    schedule.every(POLLING_INTERVAL).minutes.do(run_polling_cycle)
+
+    logger.info("Polling scheduler started")
+    while True:
+        schedule.run_pending()
+        time.sleep(1)
+
+if __name__ == "__main__":
+    logger.info("Polling worker starting up")
+    try:
+        start_scheduled_polling()
+    except KeyboardInterrupt:
+        logger.info("Polling worker shutting down")
+    except Exception as e:
+        logger.error(f"Unexpected error in polling worker: {str(e)}")
\ No newline at end of file
diff --git a/backend/microservices/processing_service.py b/backend/microservices/processing_service.py
deleted file mode 100755
index b20b7e1..0000000
--- a/backend/microservices/processing_service.py
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/usr/bin/env python3
-"""
-processing_service.py - Microservice for Data Processing
-Processes raw data (cleaning, deduplication) and stores it.
-"""
-
-def run_processing():
-    # TODO: Implement processing logic
-    print("Processing service is running...")
-
-if __name__ == '__main__':
-    run_processing()
diff --git a/backend/microservices/storage/__init__.py b/backend/microservices/storage/__init__.py
new file mode 100644
index 0000000..b93ddb5
--- /dev/null
+++ b/backend/microservices/storage/__init__.py
@@ -0,0 +1,2 @@
+# Make the storage directory a proper Python package
+# This allows importing modules from this directory
\ No newline at end of file
diff --git a/backend/microservices/storage/bookmark_service.py b/backend/microservices/storage/bookmark_service.py
new file mode 100644
index 0000000..9321ff9
--- /dev/null
+++ b/backend/microservices/storage/bookmark_service.py
@@ -0,0 +1,153 @@
+#!/usr/bin/env python3
+"""
+Bookmark Service Module
+
+This module provides functions for managing user bookmarks in the Supabase database.
+It handles creating, retrieving, and deleting bookmark relationships between users and articles.
+
+The module uses the Supabase client to interact with the following tables:
+- user_bookmarks: Manages user article bookmarks
+- news_articles: Retrieves article data for bookmarks
+
+Environment Variables Required:
+- VITE_SUPABASE_URL: Supabase project URL
+- VITE_SUPABASE_ANON_KEY: Supabase anonymous key for client operations
+"""
+
+import os
+import datetime
+import logging
+from supabase import create_client, Client
+from dotenv import load_dotenv
+
+# Initialize logger
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+# Load environment variables from .env file
+load_dotenv('../../../.env')
+
+# Initialize Supabase client with environment variables
+SUPABASE_URL = os.getenv("VITE_SUPABASE_URL")
+SUPABASE_SERVICE_KEY = os.getenv("VITE_SUPABASE_ANON_KEY")  # Using anon key for server-side operations
+supabase: Client = create_client(SUPABASE_URL, SUPABASE_SERVICE_KEY)
+
+logger.info("Bookmark Service initialized with Supabase configuration")
+
+def add_bookmark(user_id, news_id):
+    """
+    Adds a bookmark by inserting a record into the user_bookmarks table.
+
+    This function creates a bookmark relationship between a user and a news article,
+    allowing users to save articles for later reading.
+ + Args: + user_id (str): The ID of the user adding the bookmark + news_id (str): The ID of the news article to bookmark + + Returns: + dict or None: The created bookmark record if successful, None otherwise + + Raises: + Exception: If there's an error during the database operation + """ + logger.info(f"Adding bookmark for user {user_id} to article {news_id}") + try: + # Insert a new bookmark record linking user to article + result = supabase.table("user_bookmarks").insert({ + "user_id": user_id, + "news_id": news_id, + }).execute() + + # Return the first data item if available, otherwise None + bookmark_id = result.data[0]["id"] if result.data else None + logger.info(f"Successfully added bookmark with ID: {bookmark_id}") + return result.data[0] if result.data else None + except Exception as e: + logger.error(f"Error adding bookmark: {str(e)}") + # Re-raise the exception for proper error handling upstream + raise e + +def get_user_bookmarks(user_id): + """ + Retrieves all bookmarked articles for a user with full article details. + + This function performs a join between the user_bookmarks table and the news_articles table + to retrieve complete article information for all articles bookmarked by the specified user. + The results are transformed into a more user-friendly format where each article includes its + bookmark_id for reference. + + Args: + user_id (str): The ID of the user whose bookmarks should be retrieved + + Returns: + list: A list of dictionaries, each containing the full details of a bookmarked article + with an additional 'bookmark_id' field + + Raises: + Exception: If there's an error during the database operation + """ + logger.info(f"Retrieving bookmarks for user {user_id}") + try: + # Query user_bookmarks and join with news_articles to get full article details + # This uses Supabase's foreign key relationships to perform the join + result = supabase.table("user_bookmarks") \ + .select( + "id," + "news_articles(id,title,summary,content,source,published_at,url,image)" + ) \ + .eq("user_id", user_id) \ + .execute() + + # Transform the nested result structure to a more friendly format + # by flattening the news_articles data and adding the bookmark_id + bookmarks = [] + for item in result.data: + article = item["news_articles"] + article["bookmark_id"] = item["id"] # Add bookmark ID to article for reference + bookmarks.append(article) + + logger.info(f"Retrieved {len(bookmarks)} bookmarks for user {user_id}") + return bookmarks + except Exception as e: + logger.error(f"Error fetching bookmarks: {str(e)}") + # Re-raise the exception for proper error handling upstream + raise e + +def delete_bookmark(user_id, bookmark_id): + """ + Deletes a bookmark from the user_bookmarks table. + + This function removes a bookmark relationship between a user and an article. + It ensures that users can only delete their own bookmarks by checking both the + bookmark_id and user_id in the query. 
+ + Args: + user_id (str): The ID of the user who owns the bookmark + bookmark_id (str): The ID of the bookmark to delete + + Returns: + bool: True if the bookmark was successfully deleted, False if no bookmark was found + or if the deletion was unsuccessful + + Raises: + Exception: If there's an error during the database operation + """ + logger.info(f"Deleting bookmark {bookmark_id} for user {user_id}") + try: + # Delete the bookmark, ensuring it belongs to the specified user + # This double condition prevents users from deleting other users' bookmarks + result = supabase.table("user_bookmarks") \ + .delete() \ + .eq("id", bookmark_id) \ + .eq("user_id", user_id) \ + .execute() + + # Return True if at least one record was deleted, False otherwise + success = len(result.data) > 0 + logger.info(f"Bookmark deletion {'successful' if success else 'unsuccessful'}") + return success + except Exception as e: + logger.error(f"Error deleting bookmark: {str(e)}") + # Re-raise the exception for proper error handling upstream + raise e \ No newline at end of file diff --git a/backend/microservices/storage/search_logger.py b/backend/microservices/storage/search_logger.py new file mode 100644 index 0000000..2482072 --- /dev/null +++ b/backend/microservices/storage/search_logger.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 +""" +Search Logger Module + +This module provides functionality for logging user search and article view events. +It records user interactions with news articles for analytics and personalization purposes. + +The module uses the Supabase client to interact with the following tables: +- user_search_history: Tracks user search and article view interactions + +Environment Variables Required: +- VITE_SUPABASE_URL: Supabase project URL +- VITE_SUPABASE_ANON_KEY: Supabase anonymous key for client operations +""" + +import os +import datetime +import logging +from supabase import create_client, Client +from dotenv import load_dotenv + +# Initialize logger +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + +# Load environment variables from .env file +load_dotenv('../../../.env') + +# Initialize Supabase client with environment variables +SUPABASE_URL = os.getenv("VITE_SUPABASE_URL") +SUPABASE_SERVICE_KEY = os.getenv("VITE_SUPABASE_ANON_KEY") # Using anon key for server-side operations +supabase: Client = create_client(SUPABASE_URL, SUPABASE_SERVICE_KEY) + +logger.info("Search Logger Service initialized with Supabase configuration") + +def log_user_search(user_id, news_id, session_id): + """ + Logs a search event by inserting a record into the user_search_history join table. + + This function creates a record of a user viewing or searching for a specific article, + which can be used for analytics, personalization, and tracking user activity across sessions. 
+ + Args: + user_id (str): The ID of the user performing the search + news_id (str): The ID of the news article that was viewed/searched + session_id (str): The current session identifier for tracking user activity + + Returns: + dict: The Supabase response object containing the result of the insert operation + """ + logger.info(f"Logging search event for user {user_id}, article {news_id}, session {session_id}") + try: + # Create a timestamp for when the search occurred + current_time = datetime.datetime.utcnow().isoformat() + + # Insert the search record with all required fields + result = supabase.table("user_search_history").insert({ + "user_id": user_id, + "news_id": news_id, + "searched_at": current_time, + "session_id": session_id, + }).execute() + logger.debug(f"Search event logged successfully") + return result + except Exception as e: + logger.error(f"Error logging search event: {str(e)}") + raise e \ No newline at end of file diff --git a/backend/microservices/story_tracking/__init__.py b/backend/microservices/story_tracking/__init__.py new file mode 100644 index 0000000..3f4d4a5 --- /dev/null +++ b/backend/microservices/story_tracking/__init__.py @@ -0,0 +1 @@ +# This file marks the directory as a Python package \ No newline at end of file diff --git a/backend/microservices/story_tracking/article_matcher.py b/backend/microservices/story_tracking/article_matcher.py new file mode 100644 index 0000000..34e16a5 --- /dev/null +++ b/backend/microservices/story_tracking/article_matcher.py @@ -0,0 +1,113 @@ +#!/usr/bin/env python3 +""" +Article Matcher Module + +This module provides functionality for finding and matching articles related to tracked stories. +It integrates with the news fetcher service to find relevant articles based on keywords. +""" + +import datetime +import logging +from supabase import create_client, Client +import os +from dotenv import load_dotenv +from backend.microservices.news_fetcher import fetch_news +from backend.microservices.news_storage import store_article_in_supabase + +# Initialize logger +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + +# Load environment variables from .env file +load_dotenv() + +# Initialize Supabase client with service role key for admin access to bypass RLS +SUPABASE_URL = os.getenv("VITE_SUPABASE_URL") +SUPABASE_SERVICE_KEY = os.getenv("SUPABASE_SERVICE_ROLE_KEY") + +# Create Supabase client for database operations +supabase: Client = create_client(SUPABASE_URL, SUPABASE_SERVICE_KEY) + +logger.info("Article Matcher Service initialized with Supabase configuration") + +def find_related_articles(story_id, keyword): + """ + Finds and adds articles related to a tracked story based on its keyword. 
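+
+    Example (a sketch with a placeholder story ID):
+        added_count = find_related_articles("story-uuid", "climate policy")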
+ + Args: + story_id: The ID of the tracked story + keyword: The keyword to search for + + Returns: + Number of new articles added + """ + logger.info(f"Finding related articles for story {story_id}, keyword: '{keyword}'") + try: + # Get the tracked story to check when it was last updated + story_result = supabase.table("tracked_stories") \ + .select("*") \ + .eq("id", story_id) \ + .execute() + + if not story_result.data or len(story_result.data) == 0: + logger.warning(f"No story found with ID {story_id}") + return 0 + + story = story_result.data[0] + logger.debug(f"Found story: {story['keyword']}") + + # Fetch articles related to the keyword + logger.info(f"Fetching articles for keyword '{keyword}'") + articles = fetch_news(keyword) + + if not articles: + logger.info(f"No articles found for keyword '{keyword}'") + return 0 + + logger.info(f"Found {len(articles)} articles for keyword '{keyword}'") + + # Get existing article IDs for this story to avoid duplicates + logger.debug(f"Getting existing article IDs for story {story_id}") + existing_result = supabase.table("tracked_story_articles") \ + .select("news_id") \ + .eq("tracked_story_id", story_id) \ + .execute() + + existing_ids = [item["news_id"] for item in existing_result.data] if existing_result.data else [] + logger.debug(f"Found {len(existing_ids)} existing article IDs") + + # Process and add new articles + new_articles_count = 0 + for article in articles: + # First, store the article in the news_articles table + logger.debug(f"Storing article: {article.get('title', 'No title')}") + article_id = store_article_in_supabase(article) + logger.debug(f"Article stored with ID: {article_id}") + + # If this article is not already linked to the story, add it + if article_id not in existing_ids: + logger.debug(f"Linking new article {article_id} to story {story_id}") + supabase.table("tracked_story_articles").insert({ + "tracked_story_id": story_id, + "news_id": article_id, + "added_at": datetime.datetime.utcnow().isoformat() + }).execute() + new_articles_count += 1 + else: + logger.debug(f"Article {article_id} already linked to story") + + logger.info(f"Added {new_articles_count} new articles to story {story_id}") + + # Update the last_updated timestamp of the tracked story + if new_articles_count > 0: + logger.debug(f"Updating last_updated timestamp for story {story_id}") + supabase.table("tracked_stories") \ + .update({"last_updated": datetime.datetime.utcnow().isoformat()}) \ + .eq("id", story_id) \ + .execute() + + return new_articles_count + + except Exception as e: + logger.error(f"Error finding related articles: {str(e)}") + raise e \ No newline at end of file diff --git a/backend/microservices/story_tracking/article_retriever.py b/backend/microservices/story_tracking/article_retriever.py new file mode 100644 index 0000000..533739b --- /dev/null +++ b/backend/microservices/story_tracking/article_retriever.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python3 +""" +Article Retriever Module + +This module provides functionality for retrieving articles related to tracked stories. +It handles the fetching of article data from the database and manages the relationship +between tracked stories and their associated articles. 
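+
+Typical usage (illustrative; the story ID is a placeholder):
+
+    articles = get_story_articles("story-uuid")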
+""" + +import datetime +import logging +from supabase import create_client, Client +import os +from dotenv import load_dotenv + +# Initialize logger +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + +# Load environment variables from .env file +load_dotenv() + +# Initialize Supabase client with service role key for admin access to bypass RLS +SUPABASE_URL = os.getenv("VITE_SUPABASE_URL") +SUPABASE_SERVICE_KEY = os.getenv("SUPABASE_SERVICE_ROLE_KEY") + +# Create Supabase client for database operations +supabase: Client = create_client(SUPABASE_URL, SUPABASE_SERVICE_KEY) + +logger.info("Article Retriever Service initialized with Supabase configuration") + +def get_story_articles(story_id): + """ + Gets all articles related to a tracked story. + + Args: + story_id: The ID of the tracked story + + Returns: + List of articles related to the tracked story + """ + logger.info(f"Getting articles for story {story_id}") + try: + # Get all article IDs related to the tracked story + result = supabase.table("tracked_story_articles") \ + .select("news_id, added_at") \ + .eq("tracked_story_id", story_id) \ + .order("added_at", desc=True) \ + .execute() + + article_refs = result.data if result.data else [] + logger.info(f"Found {len(article_refs)} article references") + + if not article_refs: + return [] + + # Get the full article details for each article ID + articles = [] + for ref in article_refs: + logger.debug(f"Getting details for article {ref['news_id']}") + article_result = supabase.table("news_articles") \ + .select("*") \ + .eq("id", ref["news_id"]) \ + .execute() + + if article_result.data and len(article_result.data) > 0: + article = article_result.data[0] + # Add the added_at timestamp from the join table + article["added_at"] = ref["added_at"] + articles.append(article) + logger.debug(f"Added article: {article.get('title', 'No title')}") + else: + logger.warning(f"No data found for article {ref['news_id']}") + + return articles + + except Exception as e: + logger.error(f"Error getting story articles: {str(e)}") + raise e \ No newline at end of file diff --git a/backend/microservices/story_tracking/polling_service.py b/backend/microservices/story_tracking/polling_service.py new file mode 100644 index 0000000..254158a --- /dev/null +++ b/backend/microservices/story_tracking/polling_service.py @@ -0,0 +1,199 @@ +#!/usr/bin/env python3 +""" +Polling Service Module + +This module provides functionality for managing polling of tracked stories. +It handles enabling/disabling polling for stories and updating stories with new articles. +""" + +import datetime +import logging +from supabase import create_client, Client +import os +from dotenv import load_dotenv +from backend.microservices.story_tracking.article_matcher import find_related_articles + +# Initialize logger +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + +# Load environment variables from .env file +load_dotenv() + +# Initialize Supabase client with service role key for admin access to bypass RLS +SUPABASE_URL = os.getenv("VITE_SUPABASE_URL") +SUPABASE_SERVICE_KEY = os.getenv("SUPABASE_SERVICE_ROLE_KEY") + +# Create Supabase client for database operations +supabase: Client = create_client(SUPABASE_URL, SUPABASE_SERVICE_KEY) + +logger.info("Polling Service initialized with Supabase configuration") + +def toggle_polling(user_id, story_id, enable=True): + """ + Enables or disables polling for a tracked story. 
+ + Args: + user_id: The ID of the user + story_id: The ID of the tracked story + enable: True to enable polling, False to disable + + Returns: + The updated tracked story record, or None if the story wasn't found + """ + logger.info(f"{'Enabling' if enable else 'Disabling'} polling for story {story_id}, user {user_id}") + try: + # Verify that the story belongs to the user + story_result = supabase.table("tracked_stories") \ + .select("*") \ + .eq("id", story_id) \ + .eq("user_id", user_id) \ + .execute() + + if not story_result.data or len(story_result.data) == 0: + logger.warning(f"No story found with ID {story_id} for user {user_id}") + return None + + current_time = datetime.datetime.utcnow().isoformat() + + # Update the story's polling status + update_data = { + "is_polling": enable + } + + # If enabling polling, also set the last_polled_at timestamp + if enable: + update_data["last_polled_at"] = current_time + + result = supabase.table("tracked_stories") \ + .update(update_data) \ + .eq("id", story_id) \ + .eq("user_id", user_id) \ + .execute() + + if not result.data or len(result.data) == 0: + logger.error(f"Failed to update polling status for story {story_id}") + return None + + updated_story = result.data[0] + logger.info(f"Successfully {'enabled' if enable else 'disabled'} polling for story {story_id}") + + # If polling was enabled, fetch articles immediately + if enable: + logger.debug(f"Performing initial article fetch for newly enabled polling") + find_related_articles(story_id, updated_story["keyword"]) + + return updated_story + + except Exception as e: + logger.error(f"Error toggling polling status: {str(e)}") + raise e + +def get_polling_stories(): + """ + Gets all tracked stories that have polling enabled. + + This function is intended to be called by the polling worker to fetch + all stories that need to be checked for updates. + + Returns: + List of tracked stories with polling enabled + """ + logger.info("Getting all stories with polling enabled") + try: + result = supabase.table("tracked_stories") \ + .select("*") \ + .eq("is_polling", True) \ + .execute() + + stories = result.data if result.data else [] + logger.info(f"Found {len(stories)} stories with polling enabled") + return stories + + except Exception as e: + logger.error(f"Error getting polling stories: {str(e)}") + raise e + +def update_polling_timestamp(story_id): + """ + Updates the last_polled_at timestamp for a tracked story. + + This function is intended to be called after polling for new articles + for a story, whether or not new articles were found. + + Args: + story_id: The ID of the tracked story + + Returns: + True if successful, False otherwise + """ + logger.info(f"Updating polling timestamp for story {story_id}") + try: + current_time = datetime.datetime.utcnow().isoformat() + + result = supabase.table("tracked_stories") \ + .update({"last_polled_at": current_time}) \ + .eq("id", story_id) \ + .execute() + + success = result.data and len(result.data) > 0 + logger.info(f"Update {'successful' if success else 'failed'}") + return success + + except Exception as e: + logger.error(f"Error updating polling timestamp: {str(e)}") + return False + +def update_polling_stories(): + """ + Update all tracked stories with polling enabled. + + This function is similar to update_all_tracked_stories() but focuses only + on stories with polling enabled. It's intended to be called by the + polling worker to periodically fetch new articles for active stories. 
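+
+    Example (sketch of a minimal worker loop; the 5-minute interval is an
+    assumption, not a project setting):
+        import time
+        while True:
+            stats = update_polling_stories()
+            time.sleep(300)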
+ + Returns: + dict: A dictionary containing statistics about the update operation: + - stories_updated: Number of stories that received new articles + - new_articles: Total number of new articles added across all stories + """ + logger.info("Starting update of polling-enabled stories") + try: + # Get all stories with polling enabled + stories = get_polling_stories() + + if not stories: + logger.info("No polling-enabled stories found") + return {"stories_updated": 0, "new_articles": 0} + + # Update each story + stories_updated = 0 + total_new_articles = 0 + + for story in stories: + story_id = story["id"] + keyword = story["keyword"] + logger.debug(f"Polling story {story_id}, keyword: '{keyword}'") + + # Find new articles for this story + new_articles = find_related_articles(story_id, keyword) + + # Always update the last_polled_at timestamp, even if no new articles were found + update_polling_timestamp(story_id) + + if new_articles > 0: + stories_updated += 1 + total_new_articles += new_articles + logger.debug(f"Added {new_articles} new articles to story {story_id}") + else: + logger.debug(f"No new articles found for story {story_id}") + + logger.info(f"Update complete. Updated {stories_updated} stories with {total_new_articles} new articles") + return { + "stories_updated": stories_updated, + "new_articles": total_new_articles + } + + except Exception as e: + logger.error(f"Error updating polling stories: {str(e)}") + raise e \ No newline at end of file diff --git a/backend/microservices/story_tracking/story_manager.py b/backend/microservices/story_tracking/story_manager.py new file mode 100644 index 0000000..64d5b06 --- /dev/null +++ b/backend/microservices/story_tracking/story_manager.py @@ -0,0 +1,251 @@ +#!/usr/bin/env python3 +""" +Story Manager Module + +This module provides functionality for managing tracked stories, including: +- Creating new tracked stories +- Retrieving tracked stories for a user +- Getting details for a specific story +- Deleting tracked stories + +It integrates with Supabase for data persistence and handles the core story management operations. +""" + +import datetime +import logging +from supabase import create_client, Client +import os +from dotenv import load_dotenv +from backend.microservices.story_tracking.article_retriever import get_story_articles +from backend.microservices.story_tracking.article_matcher import find_related_articles + +# Initialize logger +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + +# Load environment variables from .env file +load_dotenv() + +# Initialize Supabase client with service role key for admin access to bypass RLS +SUPABASE_URL = os.getenv("VITE_SUPABASE_URL") +SUPABASE_SERVICE_KEY = os.getenv("SUPABASE_SERVICE_ROLE_KEY") + +# Create Supabase client for database operations +supabase: Client = create_client(SUPABASE_URL, SUPABASE_SERVICE_KEY) + +logger.info("Story Manager Service initialized with Supabase configuration") + +def create_tracked_story(user_id, keyword, source_article_id=None, enable_polling=False): + """ + Creates a new tracked story for a user based on a keyword. 
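+
+    Example (a sketch with placeholder IDs):
+        story = create_tracked_story("user-123", "climate policy", enable_polling=True)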
+
+    Args:
+        user_id: The ID of the user tracking the story
+        keyword: The keyword/topic to track
+        source_article_id: Optional ID of the source article that initiated tracking
+        enable_polling: Whether to enable automatic polling for this story
+
+    Returns:
+        The created tracked story record
+    """
+
+    logger.info(f"Creating tracked story for user {user_id}, keyword: '{keyword}', source_article: {source_article_id}, polling: {enable_polling}")
+    try:
+        # Check if the user is already tracking this keyword
+        logger.debug(f"Checking if user already tracks keyword '{keyword}'")
+        existing = supabase.table("tracked_stories") \
+            .select("*") \
+            .eq("user_id", user_id) \
+            .eq("keyword", keyword) \
+            .execute()
+
+        if existing.data and len(existing.data) > 0:
+            # User is already tracking this keyword
+            logger.info(f"User already tracking this keyword, found {len(existing.data)} existing entries")
+            return existing.data[0]
+
+        # Create a new tracked story
+        logger.debug("Creating new tracked story record")
+        current_time = datetime.datetime.utcnow().isoformat()
+        result = supabase.table("tracked_stories").insert({
+            "user_id": user_id,
+            "keyword": keyword,
+            "created_at": current_time,
+            "last_updated": current_time,
+            "is_polling": enable_polling,
+            "last_polled_at": current_time if enable_polling else None
+        }).execute()
+
+        if not result.data:
+            logger.error(f"Failed to create tracked story: {result}")
+            return None
+
+        # The guard above guarantees at least one row was returned
+        tracked_story = result.data[0]
+        logger.info(f"Tracked story created with ID: {tracked_story['id']}")
+
+        # If a source article was provided, link it to the tracked story
+        if source_article_id:
+            logger.debug(f"Linking source article {source_article_id} to tracked story")
+            supabase.table("tracked_story_articles").insert({
+                "tracked_story_id": tracked_story["id"],
+                "news_id": source_article_id,
+                "added_at": datetime.datetime.utcnow().isoformat()
+            }).execute()
+
+        # Perform an initial synchronous fetch so the new story has articles right away
+        logger.debug("Performing initial article fetch for new tracked story")
+        find_related_articles(tracked_story["id"], keyword)
+
+        return tracked_story
+
+    except Exception as e:
+        logger.error(f"Error creating tracked story: {str(e)}")
+        raise e
+
+def get_tracked_stories(user_id):
+    """
+    Gets all tracked stories for a user.
+
+    Args:
+        user_id: The ID of the user
+
+    Returns:
+        List of tracked stories with their related articles
+    """
+    logger.info(f"Getting tracked stories for user {user_id}")
+    try:
+        # Get all tracked stories for the user
+        result = supabase.table("tracked_stories") \
+            .select("*") \
+            .eq("user_id", user_id) \
+            .order("created_at", desc=True) \
+            .execute()
+
+        tracked_stories = result.data if result.data else []
+        logger.info(f"Found {len(tracked_stories)} tracked stories")
+
+        # For each tracked story, get its related articles
+        for story in tracked_stories:
+            logger.debug(f"Getting articles for story {story['id']}")
+            story["articles"] = get_story_articles(story["id"])
+            logger.debug(f"Found {len(story['articles'])} articles for story {story['id']}")
+
+        return tracked_stories
+
+    except Exception as e:
+        logger.error(f"Error getting tracked stories: {str(e)}")
+        raise e
+
+def get_story_details(story_id):
+    """
+    Gets details for a specific tracked story including related articles.
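+
+    Example (a sketch with a placeholder story ID):
+        story = get_story_details("story-uuid")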
+ + Args: + story_id: The ID of the tracked story + + Returns: + The tracked story with its related articles + """ + logger.info(f"Getting story details for story ID {story_id}") + try: + # Get the tracked story + result = supabase.table("tracked_stories") \ + .select("*") \ + .eq("id", story_id) \ + .execute() + + if not result.data or len(result.data) == 0: + logger.warning(f"No story found with ID {story_id}") + return None + + story = result.data[0] + logger.debug(f"Found story: {story['keyword']}") + + # Get related articles + logger.debug("Getting related articles") + story["articles"] = get_story_articles(story_id) + logger.info(f"Found {len(story['articles'])} related articles") + + return story + + except Exception as e: + logger.error(f"Error getting story details: {str(e)}") + raise e + +def delete_tracked_story(user_id, story_id): + """ + Deletes a tracked story for a user. + + Args: + user_id: The ID of the user + story_id: The ID of the tracked story to delete + + Returns: + True if successful, False otherwise + """ + logger.info(f"Deleting tracked story {story_id} for user {user_id}") + try: + # Delete the tracked story (related articles will be deleted via CASCADE) + result = supabase.table("tracked_stories") \ + .delete() \ + .eq("id", story_id) \ + .eq("user_id", user_id) \ + .execute() + + success = len(result.data) > 0 + logger.info(f"Delete operation {'successful' if success else 'failed'}") + return success + + except Exception as e: + logger.error(f"Error deleting tracked story: {str(e)}") + raise e + +def update_all_tracked_stories(): + """ + Background job to update all tracked stories with new related articles. + + This function is designed to be run as a scheduled task to keep all tracked stories + up-to-date with the latest news articles. It iterates through all tracked stories in the + database and calls find_related_articles() for each one to fetch and link new articles. + + Returns: + dict: A dictionary containing statistics about the update operation: + - stories_updated: Number of stories that received new articles + - new_articles: Total number of new articles added across all stories + """ + logger.info("Starting update of all tracked stories") + try: + # Get all tracked stories + result = supabase.table("tracked_stories") \ + .select("id, keyword") \ + .execute() + + tracked_stories = result.data if result.data else [] + logger.info(f"Found {len(tracked_stories)} tracked stories to update") + + if not tracked_stories: + return {"stories_updated": 0, "new_articles": 0} + + # Update each tracked story + stories_updated = 0 + total_new_articles = 0 + + for story in tracked_stories: + logger.debug(f"Updating story {story['id']}, keyword: '{story['keyword']}'") + new_articles = find_related_articles(story["id"], story["keyword"]) + if new_articles > 0: + stories_updated += 1 + total_new_articles += new_articles + logger.debug(f"Added {new_articles} new articles to story {story['id']}") + else: + logger.debug(f"No new articles found for story {story['id']}") + + logger.info(f"Update complete. 
Updated {stories_updated} stories with {total_new_articles} new articles") + return { + "stories_updated": stories_updated, + "new_articles": total_new_articles + } + + except Exception as e: + logger.error(f"Error updating tracked stories: {str(e)}") + raise e \ No newline at end of file diff --git a/backend/microservices/story_tracking_service.py b/backend/microservices/story_tracking_service.py index 89b4975..963c29b 100755 --- a/backend/microservices/story_tracking_service.py +++ b/backend/microservices/story_tracking_service.py @@ -10,6 +10,7 @@ - Related article discovery - User story management - Automatic story updates +- Polling for new articles The service uses clustering algorithms to group similar articles and maintains relationships between tracked stories and their associated articles. @@ -33,6 +34,23 @@ # from summarization.story_tracking.story_tracking import cluster_articles from backend.microservices.news_fetcher import fetch_news +# Import the refactored modules +from backend.microservices.story_tracking.article_matcher import find_related_articles +from backend.microservices.story_tracking.polling_service import ( + toggle_polling, + get_polling_stories, + update_polling_timestamp, + update_polling_stories +) +from backend.microservices.story_tracking.story_manager import ( + create_tracked_story, + get_tracked_stories, + get_story_details, + delete_tracked_story, + update_all_tracked_stories +) +from backend.microservices.story_tracking.article_retriever import get_story_articles + # Service initialization logging print("[DEBUG] [story_tracking_service] [main] Story tracking service starting...") @@ -69,353 +87,14 @@ def run_story_tracking(article_embeddings): Empty list is returned if article_embeddings is None or empty. """ print(f"[DEBUG] [story_tracking_service] [run_story_tracking] Running story tracking with {len(article_embeddings) if article_embeddings else 0} embeddings") - labels = cluster_articles(article_embeddings) - print(f"[DEBUG] [story_tracking_service] [run_story_tracking] Clustering complete, found {len(labels) if labels else 0} labels") - return labels - -def create_tracked_story(user_id, keyword, source_article_id=None): - """ - Creates a new tracked story for a user based on a keyword. 
- - Args: - user_id: The ID of the user tracking the story - keyword: The keyword/topic to track - source_article_id: Optional ID of the source article that initiated tracking - - Returns: - The created tracked story record - """ - - print(f"[DEBUG] [story_tracking_service] [create_tracked_story] Creating tracked story for user {user_id}, keyword: '{keyword}', source_article: {source_article_id}") - try: - # Check if the user is already tracking this keyword - print(f"[DEBUG] [story_tracking_service] [create_tracked_story] Checking if user already tracks keyword '{keyword}'") - existing = supabase.table("tracked_stories") \ - .select("*") \ - .eq("user_id", user_id) \ - .eq("keyword", keyword) \ - .execute() - - if existing.data and len(existing.data) > 0: - # User is already tracking this keyword - print(f"[DEBUG] [story_tracking_service] [create_tracked_story] User already tracking this keyword, found {len(existing.data)} existing entries") - return existing.data[0] - - # Create a new tracked story - print(f"[DEBUG] [story_tracking_service] [create_tracked_story] Creating new tracked story record") - current_time = datetime.datetime.utcnow().isoformat() - result = supabase.table("tracked_stories").insert({ - "user_id": user_id, - "keyword": keyword, - "created_at": current_time, - "last_updated": current_time - }).execute() - - if not result.data: - print(f"[DEBUG] [story_tracking_service] [create_tracked_story] Failed to create tracked story: {result}") - return None - - tracked_story = result.data[0] if result.data else None - print(f"[DEBUG] [story_tracking_service] [create_tracked_story] Tracked story created with ID: {tracked_story['id'] if tracked_story else None}") - - # If a source article was provided, link it to the tracked story - if tracked_story and source_article_id: - print(f"[DEBUG] [story_tracking_service] [create_tracked_story] Linking source article {source_article_id} to tracked story") - supabase.table("tracked_story_articles").insert({ - "tracked_story_id": tracked_story["id"], - "news_id": source_article_id, - "added_at": datetime.datetime.utcnow().isoformat() - }).execute() - - # Log that we're skipping synchronous article fetching - print(f"[DEBUG] [story_tracking_service] [create_tracked_story] Skipping synchronous article fetching to avoid resource contention") - find_related_articles(tracked_story["id"], keyword) - - return tracked_story - - except Exception as e: - print(f"[DEBUG] [story_tracking_service] [create_tracked_story] Error creating tracked story: {str(e)}") - raise e - -def get_tracked_stories(user_id): - """ - Gets all tracked stories for a user. 
- - Args: - user_id: The ID of the user - - Returns: - List of tracked stories with their related articles - """ - print(f"[DEBUG] [story_tracking_service] [get_tracked_stories] Getting tracked stories for user {user_id}") - try: - # Get all tracked stories for the user - result = supabase.table("tracked_stories") \ - .select("*") \ - .eq("user_id", user_id) \ - .order("created_at", desc=True) \ - .execute() - - tracked_stories = result.data if result.data else [] - print(f"[DEBUG] [story_tracking_service] [get_tracked_stories] Found {len(tracked_stories)} tracked stories") - - # For each tracked story, get its related articles - for story in tracked_stories: - print(f"[DEBUG] [story_tracking_service] [get_tracked_stories] Getting articles for story {story['id']}") - story["articles"] = get_story_articles(story["id"]) - print(f"[DEBUG] [story_tracking_service] [get_tracked_stories] Found {len(story['articles'])} articles for story {story['id']}") - - return tracked_stories - - except Exception as e: - print(f"[DEBUG] [story_tracking_service] [get_tracked_stories] Error getting tracked stories: {str(e)}") - raise e - -def get_story_details(story_id): - """ - Gets details for a specific tracked story including related articles. - - Args: - story_id: The ID of the tracked story - - Returns: - The tracked story with its related articles - """ - print(f"[DEBUG] [story_tracking_service] [get_story_details] Getting story details for story ID {story_id}") - try: - # Get the tracked story - result = supabase.table("tracked_stories") \ - .select("*") \ - .eq("id", story_id) \ - .execute() - - if not result.data or len(result.data) == 0: - print(f"[DEBUG] [story_tracking_service] [get_story_details] No story found with ID {story_id}") - return None - - story = result.data[0] - print(f"[DEBUG] [story_tracking_service] [get_story_details] Found story: {story['keyword']}") - - # Get related articles - print(f"[DEBUG] [story_tracking_service] [get_story_details] Getting related articles") - story["articles"] = get_story_articles(story_id) - print(f"[DEBUG] [story_tracking_service] [get_story_details] Found {len(story['articles'])} related articles") - - return story - - except Exception as e: - print(f"[DEBUG] [story_tracking_service] [get_story_details] Error getting story details: {str(e)}") - raise e - -def delete_tracked_story(user_id, story_id): - """ - Deletes a tracked story for a user. - - Args: - user_id: The ID of the user - story_id: The ID of the tracked story to delete - - Returns: - True if successful, False otherwise - """ - print(f"[DEBUG] [story_tracking_service] [delete_tracked_story] Deleting tracked story {story_id} for user {user_id}") - try: - # Delete the tracked story (related articles will be deleted via CASCADE) - result = supabase.table("tracked_stories") \ - .delete() \ - .eq("id", story_id) \ - .eq("user_id", user_id) \ - .execute() - - success = len(result.data) > 0 - print(f"[DEBUG] [story_tracking_service] [delete_tracked_story] Delete operation {'successful' if success else 'failed'}") - return success - - except Exception as e: - print(f"[DEBUG] [story_tracking_service] [delete_tracked_story] Error deleting tracked story: {str(e)}") - raise e - -def get_story_articles(story_id): - """ - Gets all articles related to a tracked story. 
- - Args: - story_id: The ID of the tracked story - - Returns: - List of articles related to the tracked story - """ - print(f"[DEBUG] [story_tracking_service] [get_story_articles] Getting articles for story {story_id}") - try: - # Get all article IDs related to the tracked story - result = supabase.table("tracked_story_articles") \ - .select("news_id, added_at") \ - .eq("tracked_story_id", story_id) \ - .order("added_at", desc=True) \ - .execute() - - article_refs = result.data if result.data else [] - print(f"[DEBUG] [story_tracking_service] [get_story_articles] Found {len(article_refs)} article references") - - if not article_refs: - return [] - - # Get the full article details for each article ID - articles = [] - for ref in article_refs: - print(f"[DEBUG] [story_tracking_service] [get_story_articles] Getting details for article {ref['news_id']}") - article_result = supabase.table("news_articles") \ - .select("*") \ - .eq("id", ref["news_id"]) \ - .execute() - - if article_result.data and len(article_result.data) > 0: - article = article_result.data[0] - # Add the added_at timestamp from the join table - article["added_at"] = ref["added_at"] - articles.append(article) - print(f"[DEBUG] [story_tracking_service] [get_story_articles] Added article: {article.get('title', 'No title')}") - else: - print(f"[DEBUG] [story_tracking_service] [get_story_articles] No data found for article {ref['news_id']}") - - return articles - - except Exception as e: - print(f"[DEBUG] [story_tracking_service] [get_story_articles] Error getting story articles: {str(e)}") - raise e - -def find_related_articles(story_id, keyword): - """ - Finds and adds articles related to a tracked story based on its keyword. - - Args: - story_id: The ID of the tracked story - keyword: The keyword to search for - - Returns: - Number of new articles added - """ - print(f"[DEBUG] [story_tracking_service] [find_related_articles] Finding related articles for story {story_id}, keyword: '{keyword}'") - try: - # Get the tracked story to check when it was last updated - story_result = supabase.table("tracked_stories") \ - .select("*") \ - .eq("id", story_id) \ - .execute() - - if not story_result.data or len(story_result.data) == 0: - print(f"[DEBUG] [story_tracking_service] [find_related_articles] No story found with ID {story_id}") - return 0 - - story = story_result.data[0] - print(f"[DEBUG] [story_tracking_service] [find_related_articles] Found story: {story['keyword']}") - - # Fetch articles related to the keyword - print(f"[DEBUG] [story_tracking_service] [find_related_articles] Fetching articles for keyword '{keyword}'") - articles = fetch_news(keyword) - - if not articles: - print(f"[DEBUG] [story_tracking_service] [find_related_articles] No articles found for keyword '{keyword}'") - return 0 - - print(f"[DEBUG] [story_tracking_service] [find_related_articles] Found {len(articles)} articles for keyword '{keyword}'") - - # Get existing article IDs for this story to avoid duplicates - print(f"[DEBUG] [story_tracking_service] [find_related_articles] Getting existing article IDs for story {story_id}") - existing_result = supabase.table("tracked_story_articles") \ - .select("news_id") \ - .eq("tracked_story_id", story_id) \ - .execute() - - existing_ids = [item["news_id"] for item in existing_result.data] if existing_result.data else [] - print(f"[DEBUG] [story_tracking_service] [find_related_articles] Found {len(existing_ids)} existing article IDs") - - # Process and add new articles - new_articles_count = 0 - for article in 
articles: - # First, store the article in the news_articles table - print(f"[DEBUG] [story_tracking_service] [find_related_articles] Storing article: {article.get('title', 'No title')}") - from backend.microservices.news_storage import store_article_in_supabase - article_id = store_article_in_supabase(article) - print(f"[DEBUG] [story_tracking_service] [find_related_articles] Article stored with ID: {article_id}") - - # If this article is not already linked to the story, add it - if article_id not in existing_ids: - print(f"[DEBUG] [story_tracking_service] [find_related_articles] Linking new article {article_id} to story {story_id}") - supabase.table("tracked_story_articles").insert({ - "tracked_story_id": story_id, - "news_id": article_id, - "added_at": datetime.datetime.utcnow().isoformat() - }).execute() - new_articles_count += 1 - else: - print(f"[DEBUG] [story_tracking_service] [find_related_articles] Article {article_id} already linked to story") - - print(f"[DEBUG] [story_tracking_service] [find_related_articles] Added {new_articles_count} new articles to story {story_id}") - - # Update the last_updated timestamp of the tracked story - if new_articles_count > 0: - print(f"[DEBUG] [story_tracking_service] [find_related_articles] Updating last_updated timestamp for story {story_id}") - supabase.table("tracked_stories") \ - .update({"last_updated": datetime.datetime.utcnow().isoformat()}) \ - .eq("id", story_id) \ - .execute() - - return new_articles_count - - except Exception as e: - print(f"[DEBUG] [story_tracking_service] [find_related_articles] Error finding related articles: {str(e)}") - raise e - -def update_all_tracked_stories(): - """ - Background job to update all tracked stories with new related articles. - - This function is designed to be run as a scheduled task to keep all tracked stories - up-to-date with the latest news articles. It iterates through all tracked stories in the - database and calls find_related_articles() for each one to fetch and link new articles. - - Returns: - dict: A dictionary containing statistics about the update operation: - - stories_updated: Number of stories that received new articles - - new_articles: Total number of new articles added across all stories - """ - print(f"[DEBUG] [story_tracking_service] [update_all_tracked_stories] Starting update of all tracked stories") - try: - # Get all tracked stories - result = supabase.table("tracked_stories") \ - .select("id, keyword") \ - .execute() - - tracked_stories = result.data if result.data else [] - print(f"[DEBUG] [story_tracking_service] [update_all_tracked_stories] Found {len(tracked_stories)} tracked stories to update") - - if not tracked_stories: - return {"stories_updated": 0, "new_articles": 0} - - # Update each tracked story - stories_updated = 0 - total_new_articles = 0 - - for story in tracked_stories: - print(f"[DEBUG] [story_tracking_service] [update_all_tracked_stories] Updating story {story['id']}, keyword: '{story['keyword']}'") - new_articles = find_related_articles(story["id"], story["keyword"]) - if new_articles > 0: - stories_updated += 1 - total_new_articles += new_articles - print(f"[DEBUG] [story_tracking_service] [update_all_tracked_stories] Added {new_articles} new articles to story {story['id']}") - else: - print(f"[DEBUG] [story_tracking_service] [update_all_tracked_stories] No new articles found for story {story['id']}") - - print(f"[DEBUG] [story_tracking_service] [update_all_tracked_stories] Update complete. 
Updated {stories_updated} stories with {total_new_articles} new articles")
-        return {
-            "stories_updated": stories_updated,
-            "new_articles": total_new_articles
-        }
-
-    except Exception as e:
-        print(f"[DEBUG] [story_tracking_service] [update_all_tracked_stories] Error updating tracked stories: {str(e)}")
-        raise e
+    # Uncomment when clustering functionality is implemented
+    # labels = cluster_articles(article_embeddings)
+    # print(f"[DEBUG] [story_tracking_service] [run_story_tracking] Clustering complete, found {len(labels) if labels else 0} labels")
+    # return labels
+    return []
+
+# update_polling_stories function has been moved to backend.microservices.story_tracking.polling_service
+# update_all_tracked_stories function has been moved to backend.microservices.story_tracking.story_manager

 if __name__ == '__main__':
     # Example usage - this code runs when the script is executed directly
diff --git a/backend/microservices/summarization/__init__.py b/backend/microservices/summarization/__init__.py
new file mode 100644
index 0000000..3f4d4a5
--- /dev/null
+++ b/backend/microservices/summarization/__init__.py
@@ -0,0 +1 @@
+# This file marks the directory as a Python package
\ No newline at end of file
diff --git a/backend/microservices/summarization/article_processor.py b/backend/microservices/summarization/article_processor.py
new file mode 100644
index 0000000..9bcb996
--- /dev/null
+++ b/backend/microservices/summarization/article_processor.py
@@ -0,0 +1,121 @@
+#!/usr/bin/env python3
+"""
+Article Processor Module
+
+This module provides functionality for processing news articles, including:
+- Fetching article content
+- Generating summaries
+- Extracting keywords
+- Managing bookmarks
+
+It integrates with Supabase for data persistence and OpenAI for text summarization.
+"""
+
+import os
+from supabase import create_client, Client
+from dotenv import load_dotenv
+from backend.core.utils import setup_logger, log_exception
+from backend.microservices.summarization.content_fetcher import fetch_article_content
+from backend.microservices.summarization.keyword_extractor import get_keywords
+
+# Import the summarization function from the utilities module
+# This avoids circular imports while maintaining functionality
+from backend.microservices.summarization.summarization_utils import run_summarization
+
+# Initialize logger
+logger = setup_logger(__name__)
+
+# Load environment variables
+load_dotenv('../../.env')  # Optional: Only use this for local development
+
+# Initialize Supabase client
+SUPABASE_URL = os.getenv("VITE_SUPABASE_URL")
+SUPABASE_SERVICE_KEY = os.getenv("VITE_SUPABASE_ANON_KEY")
+supabase: Client = create_client(SUPABASE_URL, SUPABASE_SERVICE_KEY)
+
+logger.info("Article Processor Service initialized with Supabase configuration")
+
+@log_exception(logger)
+def process_articles(article_ids, user_id):
+    """
+    Processes a batch of articles identified by their IDs on behalf of a user.
+
+    This function performs the following operations:
+    1. Retrieves the requested articles from Supabase by their IDs.
+    2. Fetches missing content for articles if needed.
+    3. Generates summaries for each article.
+    4. Extracts keywords for filtering.
+
+    Args:
+        article_ids (list): A list of article IDs to process.
+        user_id (str): The ID of the user for bookmark checking.
+
+    Returns:
+        list: A list of dictionaries containing processed article data.
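+
+    Example (a sketch with placeholder IDs):
+        summaries = process_articles(["news-1", "news-2"], "user-123")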
+ """ + try: + articles = [] + + # Step 1: Fetch the news_ids from user_bookmarks for the given user_id + logger.debug(f"Fetching bookmarks for user {user_id}") + bookmark_result = supabase.table("user_bookmarks").select("id, news_id").eq("user_id", user_id).execute() + + bookmark_records = {} + if bookmark_result.data: + bookmark_records = {item["news_id"]: item["id"] for item in bookmark_result.data} + + bookmarked_news_ids = set(item["news_id"] for item in bookmark_result.data) if bookmark_result.data else set() + + logger.debug(f"Bookmarked news IDs: {bookmarked_news_ids}") + logger.debug(f"Article IDs to process: {article_ids}") + + # Step 2: Fetch all articles from news_articles using the article_ids + if article_ids: # Assuming article_ids is defined or fetched earlier + logger.debug(f"Fetching {len(article_ids)} articles from database") + result = supabase.table("news_articles").select("*").in_("id", article_ids).execute() + articles = result.data + + # Step 3: Add the 'bookmarked' key to each article + logger.debug(f"Adding bookmark information to {len(articles)} articles") + for article in articles: + article["bookmarked_id"] = bookmark_records.get(article["id"], None) + + logger.debug(f"Retrieved {len(articles)} articles for processing") + + summarized_articles = [] + for article in articles: + logger.info(f"Processing article: {article['title']}") + + content = article.get('content') + if not content: + logger.debug(f"No content found for article, fetching from URL: {article['url']}") + content = fetch_article_content(article['url']) + + if content: + logger.debug("Generating summary from fetched content") + summary = run_summarization(content) + else: + logger.debug("Generating summary from existing content") + summary = run_summarization(article.get('content', '')) + + logger.debug("Extracting keywords for filtering") + summarized_articles.append({ + 'id': article['id'], + 'title': article['title'], + 'author': article.get('author', 'Unknown Author'), + 'source': article.get('source'), + 'publishedAt': article.get('published_at'), + 'url': article['url'], + 'urlToImage': article.get('image'), + 'content': article.get('content', ''), + 'summary': summary, + 'filter_keywords': get_keywords(article.get('content', '')), + 'bookmarked_id': article.get('bookmarked_id', None) + }) + + logger.info(f"Successfully processed {len(summarized_articles)} articles") + return summarized_articles + + except Exception as e: + logger.error(f"Error processing articles: {str(e)}") + raise e \ No newline at end of file diff --git a/backend/microservices/summarization/content_fetcher.py b/backend/microservices/summarization/content_fetcher.py new file mode 100644 index 0000000..9a1b4f7 --- /dev/null +++ b/backend/microservices/summarization/content_fetcher.py @@ -0,0 +1,63 @@ +#!/usr/bin/env python3 +""" +Content Fetcher Module + +This module provides functionality for fetching and extracting content from news article URLs. +It handles various HTTP request exceptions and content parsing. +""" + +import requests +from bs4 import BeautifulSoup +from backend.core.utils import setup_logger, log_exception + +# Initialize logger +logger = setup_logger(__name__) + +@log_exception(logger) +def fetch_article_content(url): + """ + Fetches and extracts the main content from a given URL. + + Args: + url (str): The URL of the article to fetch content from. + + Returns: + str or None: The extracted article content as plain text. + Returns None if the fetch fails or content is invalid. 
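+
+    Example (illustrative URL):
+        text = fetch_article_content("https://example.com/some-article")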
+ """ + try: + if not url or not url.startswith('http'): + logger.error(f"Invalid URL format: {url}") + return None + + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' + } + response = requests.get(url, headers=headers, timeout=10) + response.raise_for_status() + + soup = BeautifulSoup(response.text, 'html.parser') + paragraphs = soup.find_all('p') + + if not paragraphs: + logger.warning(f"No content found at URL: {url}") + return None + + content = ' '.join([p.get_text() for p in paragraphs]) + return content + + except requests.exceptions.Timeout: + logger.error(f"Request timed out for URL: {url}") + return None + except requests.exceptions.SSLError: + logger.error(f"SSL verification failed for URL: {url}") + return None + except requests.exceptions.ConnectionError: + logger.error(f"Failed to connect to URL: {url}") + return None + except requests.exceptions.RequestException as e: + logger.error(f"Error fetching article content from {url}: {str(e)}") + return None + except Exception as e: + logger.error(f"Unexpected error processing {url}: {str(e)}") + return None \ No newline at end of file diff --git a/backend/microservices/summarization/keyword_extractor.py b/backend/microservices/summarization/keyword_extractor.py new file mode 100644 index 0000000..06abbe7 --- /dev/null +++ b/backend/microservices/summarization/keyword_extractor.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python3 +""" +Keyword Extractor Module + +This module provides functionality for extracting keywords from text content using YAKE. +It helps identify key topics and themes in article content for better categorization and filtering. +""" + +import yake +from backend.core.utils import setup_logger, log_exception + +# Initialize logger +logger = setup_logger(__name__) + +@log_exception(logger) +def get_keywords(text, num_keywords=1): + """ + Extracts key phrases from the input text using YAKE keyword extraction. + + Args: + text (str): The input text to extract keywords from. + num_keywords (int, optional): Number of keywords to extract. Defaults to 1. + + Returns: + list: A list of extracted keywords/key phrases. + """ + kw_extractor = yake.KeywordExtractor(top=num_keywords, lan='en') + keywords = kw_extractor.extract_keywords(text) + return [kw[0] for kw in keywords] \ No newline at end of file diff --git a/backend/microservices/summarization/summarization_utils.py b/backend/microservices/summarization/summarization_utils.py new file mode 100644 index 0000000..1fb0f83 --- /dev/null +++ b/backend/microservices/summarization/summarization_utils.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python3 +""" +Summarization Utilities Module + +This module provides core summarization functionality that can be used by other modules +without creating circular dependencies. + +Key Features: +- Text summarization using OpenAI's GPT models +""" + +import openai +from backend.core.config import Config +from backend.core.utils import setup_logger, log_exception + +# Initialize logger +logger = setup_logger(__name__) + +# Configure OpenAI with your API key from environment variables +openai.api_key = Config.OPENAI_API_KEY + +@log_exception(logger) +def run_summarization(text): + """ + Generates a concise summary of the provided text using OpenAI's GPT model. + + Args: + text (str): The input text to be summarized. + + Returns: + str: A summarized version of the input text (approximately 150 words). + Returns an error message if summarization fails. 
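+
+    Example (illustrative; currently returns the placeholder string, see Note):
+        summary = run_summarization("Long article text ...")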
+
+    Note:
+        The live OpenAI call (gpt-4o-mini, temperature 0.5, max 200 tokens) is
+        currently commented out below, so this function returns a placeholder
+        string until it is re-enabled.
+    """
+    return "Summarized Text"
+    # try:
+    #     response = openai.ChatCompletion.create(
+    #         model="gpt-4o-mini",  # Change to your desired model (e.g., "gpt-3.5-turbo")
+    #         messages=[
+    #             {"role": "system", "content": "You are a helpful assistant that summarizes text in approximately 150 words."},
+    #             {"role": "user", "content": f"Please summarize the following text:\n\n{text}"}
+    #         ],
+    #         max_tokens=200,
+    #         temperature=0.5
+    #     )
+    #     return response.choices[0].message.content.strip()
+    # except Exception as e:
+    #     logger.error(f"Error in summarization: {str(e)}")
+    #     return "Error generating summary"
\ No newline at end of file
diff --git a/backend/microservices/summarization_service.py b/backend/microservices/summarization_service.py
index b7e4063..525f2f5 100755
--- a/backend/microservices/summarization_service.py
+++ b/backend/microservices/summarization_service.py
@@ -14,13 +14,17 @@
 import json
 import requests
-from bs4 import BeautifulSoup
 import openai
 from backend.core.config import Config
 from backend.core.utils import setup_logger, log_exception
-import yake
 import os
 
+# Import the refactored modules
+from backend.microservices.summarization.content_fetcher import fetch_article_content
+from backend.microservices.summarization.keyword_extractor import get_keywords
+from backend.microservices.summarization.article_processor import process_articles
+from backend.microservices.summarization.summarization_utils import run_summarization
+
 # Initialize logger
 logger = setup_logger(__name__)
 
@@ -40,206 +44,5 @@
 supabase: Client = create_client(SUPABASE_URL, SUPABASE_SERVICE_KEY)
 
-@log_exception(logger)
-def fetch_article_content(url):
-    """
-    Fetches and extracts the main content from a given URL.
-
-    Args:
-        url (str): The URL of the article to fetch content from.
-
-    Returns:
-        str or None: The extracted article content as plain text.
-                     Returns None if the fetch fails or content is invalid.
-    """
-    try:
-        if not url or not url.startswith('http'):
-            logger.error(f"Invalid URL format: {url}")
-            return None
-
-        headers = {
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
-        }
-        response = requests.get(url, headers=headers, timeout=10)
-        response.raise_for_status()
-
-        soup = BeautifulSoup(response.text, 'html.parser')
-        paragraphs = soup.find_all('p')
-
-        if not paragraphs:
-            logger.warning(f"No content found at URL: {url}")
-            return None
-
-        content = ' '.join([p.get_text() for p in paragraphs])
-        return content
-
-    except requests.exceptions.Timeout:
-        logger.error(f"Request timed out for URL: {url}")
-        return None
-    except requests.exceptions.SSLError:
-        logger.error(f"SSL verification failed for URL: {url}")
-        return None
-    except requests.exceptions.ConnectionError:
-        logger.error(f"Failed to connect to URL: {url}")
-        return None
-    except requests.exceptions.RequestException as e:
-        logger.error(f"Error fetching article content from {url}: {str(e)}")
-        return None
-    except Exception as e:
-        logger.error(f"Unexpected error processing {url}: {str(e)}")
-        return None
-
-
-@log_exception(logger)
-def run_summarization(text):
-    """
-    Generates a concise summary of the provided text using OpenAI's GPT model.
-
-    Args:
-        text (str): The input text to be summarized.
-
-    Returns:
-        str: A summarized version of the input text (approximately 150 words).
- Returns an error message if summarization fails. - - Note: - Uses OpenAI's GPT-4 (or your specified model) with specific parameters: - - Temperature: 0.5 - - Max tokens: 200 - """ - try: - response = openai.ChatCompletion.create( - model="gpt-4o-mini", # Change to your desired model (e.g., "gpt-3.5-turbo") - messages=[ - {"role": "system", "content": "You are a helpful assistant that summarizes text in approximately 150 words."}, - {"role": "user", "content": f"Please summarize the following text:\n\n{text}"} - ], - max_tokens=200, - temperature=0.5 - ) - return response.choices[0].message.content.strip() - except Exception as e: - logger.error(f"Error in summarization: {str(e)}") - return "Error generating summary" - - -@log_exception(logger) -def get_keywords(text, num_keywords=1): - """ - Extracts key phrases from the input text using YAKE keyword extraction. - - Args: - text (str): The input text to extract keywords from. - num_keywords (int, optional): Number of keywords to extract. Defaults to 1. - - Returns: - list: A list of extracted keywords/key phrases. - """ - kw_extractor = yake.KeywordExtractor(top=num_keywords, lan='en') - keywords = kw_extractor.extract_keywords(text) - return [kw[0] for kw in keywords] - - -@log_exception(logger) -def process_articles(article_ids,user_id): - """ - Processes a batch of articles associated with a specific session ID. - - This function performs the following operations: - 1. Retrieves articles from Supabase based on the session ID. - 2. Fetches missing content for articles if needed. - 3. Generates summaries for each article. - 4. Extracts keywords for filtering. - - Args: - article_ids (list): A list of article IDs to process. - - Returns: - list: A list of dictionaries containing processed article data. 
- """ - try: - # history_result = supabase.table("user_search_history").select("news_id").eq("session_id", session_id).execute() - # article_ids = [record["news_id"] for record in history_result.data] - - # articles = [] - # if article_ids: - # result = supabase.table("news_articles").select("*").in_("id", article_ids).execute() - # articles = result.data - - - articles = [] - - # Step 1: Fetch the news_ids from user_bookmarks for the given user_id - bookmark_result = supabase.table("user_bookmarks").select("news_id").eq("user_id", user_id).execute() - bookmark_result = supabase.table("user_bookmarks").select("id, news_id").eq("user_id", user_id).execute() - - bookmark_records = {} - if bookmark_result.data: - bookmark_records = {item["news_id"]: item["id"] for item in bookmark_result.data} - - - bookmarked_news_ids = set(item["news_id"] for item in bookmark_result.data) if bookmark_result.data else set() - - print(f"Bookmarked news IDs: {bookmarked_news_ids}") - print(f"Article IDs: {article_ids}") - - # Step 2: Fetch all articles from news_articles using the article_ids - if article_ids: # Assuming article_ids is defined or fetched earlier - result = supabase.table("news_articles").select("*").in_("id", article_ids).execute() - articles = result.data - - # Step 3: Add the 'bookmarked' key to each article - for article in articles: - # article["bookmarked_id"] = article["id"] if article["id"] in bookmarked_news_ids else None - article["bookmarked_id"] = bookmark_records.get(article["id"], None) - - # # If article_ids isn't defined earlier, you can fetch it here as well - # if not article_ids and bookmark_result.data: - # article_ids = [item["news_id"] for item in bookmark_result.data] - # result = supabase.table("news_articles").select("*").in_("id", article_ids).execute() - # articles = result.data - # for article in articles: - # article["bookmarked"] = "yes" # All articles here are bookmarked - - - - - - print(articles) - - summarized_articles = [] - for article in articles: - logger.info(f"Processing article: {article['title']}") - - content = article.get('content') - if not content: - content = fetch_article_content(article['url']) - - if content: - summary = run_summarization(content) - else: - summary = run_summarization(article.get('content', '')) - - summarized_articles.append({ - 'id': article['id'], - 'title': article['title'], - 'author': article.get('author', 'Unknown Author'), - 'source': article.get('source'), - 'publishedAt': article.get('published_at'), - 'url': article['url'], - 'urlToImage': article.get('image'), - 'content': article.get('content', ''), - 'summary': summary, - 'filter_keywords': get_keywords(article.get('content', '')), - 'bookmarked_id': article.get('bookmarked_id', None) - }) - - return summarized_articles - - except Exception as e: - logger.error(f"Error processing articles: {str(e)}") - raise e - - if __name__ == '__main__': process_articles() \ No newline at end of file diff --git a/data/.DS_Store b/data/.DS_Store deleted file mode 100644 index d6eb9db..0000000 Binary files a/data/.DS_Store and /dev/null differ diff --git a/docs/story-tracking-documentation.md b/docs/story-tracking-documentation.md new file mode 100644 index 0000000..dbfbf03 --- /dev/null +++ b/docs/story-tracking-documentation.md @@ -0,0 +1,205 @@ +# Story Tracking Documentation + +## Overview + +Story Tracking is a feature in the News Aggregator application that allows users to track specific news topics over time. The system works by: + +1. 
+1. Enabling users to select keywords to track
+2. Backend polling for new articles related to these keywords
+3. Real-time frontend updates using Supabase subscriptions
+4. Providing users control over which stories are tracked and when polling occurs
+
+## Architecture
+
+The Story Tracking feature follows a backend-first approach: the backend owns the tracking and polling logic, and the frontend consumes it through REST endpoints and real-time subscriptions.
+
+### Backend Components
+
+1. **Story Tracking Service** (`story_tracking_service.py`)
+   - Core functionality for tracking stories by keyword
+   - Manages tracked stories in the database
+   - Handles polling logic for finding new articles
+   - Provides functions for story management (create, get, delete, etc.)
+
+2. **API Gateway** (`api_gateway.py`)
+   - Exposes RESTful endpoints for frontend interaction
+   - Routes for creating, retrieving, and deleting tracked stories
+   - Special endpoints for controlling polling (`/start` and `/stop`)
+   - Authentication middleware to secure operations
+
+3. **Polling Worker**
+   - Background process that checks for stories due for polling
+   - Fetches new articles for tracked stories
+   - Updates the database with new articles
+
+### Frontend Components
+
+1. **StoryTrackingContext** (`StoryTrackingContext.tsx`)
+   - Provides app-wide state management for tracked stories
+   - Handles API calls to the backend for story operations
+   - Exposes functions for starting/stopping tracking and polling
+
+2. **StoryTrackingPage** (`StoryTrackingPage.tsx`)
+   - UI for viewing and managing a tracked story
+   - Controls for toggling automatic updates (polling)
+   - Displays real-time updates of new articles
+
+3. **StoryTrackingTabContext** (`StoryTrackingTabContext.tsx`)
+   - Manages real-time subscription to Supabase for updates
+   - Displays articles for a specific tracked story
+   - Handles formatting and sorting of article data
+
+4. **ArticleView** (`ArticleView.tsx`)
+   - Provides tracking button in article view
+   - Allows users to track stories from individual articles
+
+### Database Schema
+
+The feature uses three main tables in Supabase:
+
+1. `tracked_stories`
+   - `id`: Unique identifier for each tracked story
+   - `user_id`: The user tracking the story
+   - `keyword`: The keyword/phrase being tracked
+   - `created_at`: When tracking started
+   - `is_polling`: Whether automatic polling is enabled
+   - `last_polled_at`: When the story was last checked for updates
+
+2. `tracked_story_articles`
+   - `id`: Unique identifier for the tracked article association
+   - `tracked_story_id`: Foreign key to tracked_stories
+   - `news_id`: Foreign key to news_articles
+   - `added_at`: When this article was added to the tracked story
+
+3. `news_articles`
+   - Contains all article data
+   - Used by the tracking system to store and retrieve articles
+
+## API Endpoints
+
+The API Gateway provides the following endpoints for story tracking:
+
+1. **GET `/api/story_tracking`**
+   - Gets news articles for a keyword
+   - Query params: `keyword`
+   - No authentication required
+
+2. **POST `/api/story_tracking`**
+   - Creates a new tracked story
+   - Body: `{ keyword, sourceArticleId? }`
+   - Requires authentication
+
+3. **GET `/api/story_tracking/user`**
+   - Gets all tracked stories for the authenticated user
+   - Requires authentication
+
+4. **GET `/api/story_tracking/{story_id}`**
+   - Gets details for a specific story including articles
+   - Requires authentication
+
+5. **DELETE `/api/story_tracking/{story_id}`**
+   - Deletes a tracked story
+   - Requires authentication
+
+6. **POST `/api/story_tracking/start`**
+   - Starts polling for a tracked story
+   - Body: `{ story_id }`
+   - Requires authentication
+
+7. **POST `/api/story_tracking/stop`**
+   - Stops polling for a tracked story
+   - Body: `{ story_id }`
+   - Requires authentication
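+
+For orientation, the endpoints can be exercised roughly as follows. This is a minimal sketch using Python's `requests`; the base URL, token value, and response fields are assumptions about a local deployment, not guarantees of the API:
+
+```python
+import requests
+
+BASE_URL = "http://localhost:8080"  # assumed local gateway address
+TOKEN = "<supabase-session-jwt>"    # session token issued by Supabase Auth
+HEADERS = {"Authorization": f"Bearer {TOKEN}"}
+
+# Keyword search (no authentication required)
+articles = requests.get(f"{BASE_URL}/api/story_tracking",
+                        params={"keyword": "elections"}).json()
+
+# Create a tracked story (authenticated)
+story = requests.post(f"{BASE_URL}/api/story_tracking",
+                      json={"keyword": "elections"}, headers=HEADERS).json()
+
+# Enable automatic polling (assumes the create response includes the story id)
+requests.post(f"{BASE_URL}/api/story_tracking/start",
+              json={"story_id": story["id"]}, headers=HEADERS)
+```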
+
+## Frontend Service Layer
+
+The `storyTrackingService.ts` module provides a clean interface for the frontend to interact with the backend:
+
+1. `createTrackedStory(keyword, sourceArticleId?)`: Create a new tracked story
+2. `getTrackedStories()`: Retrieve all tracked stories for the user
+3. `getTrackedStory(id)`: Get details for a specific story
+4. `deleteTrackedStory(id)`: Stop tracking a story
+5. `startPolling(storyId)`: Enable automatic updates for a story
+6. `stopPolling(storyId)`: Disable automatic updates for a story
+
+## Data Flow
+
+### Creating and Tracking a Story
+
+1. User clicks on the tracking button in ArticleView
+2. Frontend navigates to `/story-tracking/{keyword}`
+3. StoryTrackingPage mounts and calls `startTracking(keyword)`
+4. StoryTrackingContext makes a POST call to `/api/story_tracking` with the keyword
+5. API Gateway creates a tracked story in the database using `create_tracked_story()`
+6. Backend searches for and associates relevant articles with the story
+7. Response with story details is sent back to the frontend
+8. StoryTrackingContext updates its state with the new story
+9. StoryTrackingPage displays the story details
+
+### Real-time Updates
+
+1. StoryTrackingTabContext sets up a Supabase subscription when a story page is opened
+2. The subscription listens for INSERT events on the `tracked_story_articles` table
+3. When an article is added by the backend polling process:
+   - Supabase sends a real-time notification to the frontend
+   - Frontend receives the article ID and fetches full details
+   - New article is added to the UI without page refresh
+
+### Controlling Polling
+
+1. User clicks "Auto-update" button on StoryTrackingPage
+2. Frontend calls `togglePolling(storyId, true/false)`
+3. StoryTrackingContext calls either `startPolling()` or `stopPolling()`
+4. Request is sent to `/api/story_tracking/start` or `/api/story_tracking/stop`
+5. Backend updates the `is_polling` flag on the tracked story
+6. Polling Worker recognizes the change and includes/excludes the story from polling
+
+## Polling Logic (Backend)
+
+1. The Polling Worker runs as a background process
+2. It periodically checks for stories with `is_polling = true`
+3. For each polling-enabled story:
+   - Check if it's due for polling (based on `last_polled_at` and polling frequency)
+   - Fetch new articles using the story's keyword
+   - Associate new articles with the story in `tracked_story_articles`
+   - Update `last_polled_at` timestamp
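+
+The loop described above can be pictured with the `schedule` package added to `requirements.txt`. This is an illustrative sketch only; the helper functions are stand-ins for the real service code, not its actual API:
+
+```python
+import time
+import schedule
+
+def get_stories_due_for_polling():
+    # Stand-in: would select tracked_stories rows with is_polling = true
+    # whose last_polled_at is older than the polling frequency.
+    return []
+
+def add_new_articles_to_story(story_id, keyword):
+    # Stand-in: would fetch fresh articles for the keyword, insert rows into
+    # tracked_story_articles, and update last_polled_at on the story.
+    pass
+
+def poll_tracked_stories():
+    for story in get_stories_due_for_polling():
+        add_new_articles_to_story(story["id"], story["keyword"])
+
+schedule.every(5).minutes.do(poll_tracked_stories)  # cadence is an assumption
+
+while True:
+    schedule.run_pending()
+    time.sleep(1)
+```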
+
+## Error Handling
+
+- Frontend shows loading states during API calls
+- Timeout detection for long-running operations
+- Error messages displayed to users
+- Fallbacks for when real-time subscriptions fail
+
+## Authentication Flow
+
+All story tracking operations (except initial keyword search) require authentication:
+
+1. Frontend gets the current session token from Supabase Auth
+2. Token is included in all API requests as a Bearer token
+3. Backend validates the token using the JWT middleware
+4. Operations are only performed for the authenticated user
+
+## Code Relationships
+
+- `StoryTrackingContext.tsx` is the central connector that:
+  - Provides state to all story tracking components
+  - Makes API calls through `storyTrackingService.ts`
+  - Updates state based on API responses
+
+- `StoryTrackingPage.tsx` uses the context to:
+  - Display a specific tracked story
+  - Control polling status
+  - Remove tracking when requested
+
+- `StoryTrackingTabContext.tsx` handles:
+  - Real-time subscriptions to story updates
+  - Rendering and formatting articles
+
+## Future Improvements
+
+- Enhanced error recovery for polling processes
+- Improved article relevance through better keyword matching
+- User preferences for polling frequency
+- Support for more complex tracking queries beyond simple keywords
+- Email or push notifications for important story updates
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index f24af4a..56df61d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -37,4 +37,7 @@
 psycopg2-binary
 pytest
 flask_cors
-flask_restx
\ No newline at end of file
+flask_restx
+
+# Scheduling & Background Tasks
+schedule
\ No newline at end of file
diff --git a/start-polling-worker.sh b/start-polling-worker.sh
new file mode 100755
index 0000000..2dd96d5
--- /dev/null
+++ b/start-polling-worker.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+# start-polling-worker.sh
+# Script to start the polling worker for tracking news stories
+
+echo "Starting News Aggregator Polling Worker..."
+
+# Activate virtual environment if it exists
+if [ -d "venv" ]; then
+    echo "Activating virtual environment..."
+    source venv/bin/activate
+fi
+
+# Install dependencies if needed
+if [ "$1" == "--install" ]; then
+    echo "Installing dependencies..."
+    pip install -r requirements.txt
+fi
+
+# Set environment variables from .env file if it exists
+if [ -f ".env" ]; then
+    echo "Loading environment variables from .env file..."
+    export $(grep -v '^#' .env | xargs)
+fi
+
+# Start the polling worker in the foreground; it runs until manually terminated
+echo "Polling worker starting. Press Ctrl+C to stop."
+exec python -m backend.microservices.polling_worker
diff --git a/start-services.sh b/start-services.sh
new file mode 100644
index 0000000..5967103
--- /dev/null
+++ b/start-services.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+set -e
+
+echo "Starting NewsFeast services..."
+
+# Start polling worker in the background
+echo "Starting polling worker..."
+python -m backend.microservices.polling_worker &
+
+# Start API gateway in the foreground
+echo "Starting API gateway..."
+exec python backend/api_gateway/api_gateway.py
\ No newline at end of file
diff --git a/vercel.json b/vercel.json
deleted file mode 100644
index 600340f..0000000
--- a/vercel.json
+++ /dev/null
@@ -1,29 +0,0 @@
-{
-  "version": 2,
-  "builds": [
-    {
-      "src": "backend/api_gateway/api_gateway.py",
-      "use": "@vercel/python"
-    }
-  ],
-  "routes": [
-    {
-      "src": "/api/(.*)",
-      "dest": "backend/api_gateway/api_gateway.py"
-    },
-    {
-      "src": "/health",
-      "dest": "backend/api_gateway/api_gateway.py"
-    },
-    {
-      "src": "/summarize",
-      "dest": "backend/api_gateway/api_gateway.py"
-    }
-  ],
-  "env": {
-    "VITE_SUPABASE_URL": "@VITE_SUPABASE_URL",
-    "SUPABASE_SERVICE_ROLE_KEY": "@SUPABASE_SERVICE_ROLE_KEY",
-    "JWT_SECRET_KEY": "@JWT_SECRET_KEY",
-    "OPENAI_API_KEY": "@OPENAI_API_KEY"
-  }
-}
\ No newline at end of file