Intermediate
Data Collection
Fetch historical stock prices with yfinance, collect financial news headlines from NewsAPI, and engineer features for the prediction model.
Step 1: Stock Price Data
# app/data_collector.py
import logging
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import yfinance as yf
logger = logging.getLogger(__name__)
class StockDataCollector:
    """Download daily OHLCV price history through yfinance."""

    def fetch_history(self, ticker: str, days: int = 730) -> pd.DataFrame:
        """Return the Open/High/Low/Close/Volume history for one ticker.

        The requested window spans the ``days`` calendar days that end at
        the current local time. The number of rows returned is logged at
        INFO level.
        """
        handle = yf.Ticker(ticker)
        window_end = datetime.now()
        window_start = window_end - timedelta(days=days)
        raw = handle.history(start=window_start, end=window_end)
        raw.index = pd.to_datetime(raw.index)
        # Keep only the price/volume columns; anything else history() adds is dropped.
        wanted = ["Open", "High", "Low", "Close", "Volume"]
        df = raw[wanted]
        logger.info(f"Fetched {len(df)} days for {ticker}")
        return df

    def fetch_multiple(self, tickers: list[str], days: int = 730) -> dict[str, pd.DataFrame]:
        """Return a mapping of each ticker symbol to its price history."""
        histories = {}
        for symbol in tickers:
            histories[symbol] = self.fetch_history(symbol, days)
        return histories
Step 2: News Headlines
# News collection with NewsAPI
from newsapi import NewsApiClient
class NewsCollector:
    """Collect news headlines through the NewsAPI client."""

    def __init__(self, api_key: str):
        """Create the underlying NewsAPI client with the given API key."""
        self.api = NewsApiClient(api_key=api_key)

    def fetch_headlines(self, query: str, days: int = 30) -> list[dict]:
        """Fetch recent news headlines for a company.

        Args:
            query: Search string passed to the API as ``q``.
            days: How many days back from now the search window starts.

        Returns:
            One dict per article with the keys ``title``, ``description``,
            ``published_at`` and ``source``. ``description`` is always a
            string: a missing or null description becomes ``""``.
        """
        from_date = (datetime.now() - timedelta(days=days)).strftime("%Y-%m-%d")
        result = self.api.get_everything(
            q=query, from_param=from_date,
            language="en", sort_by="relevancy", page_size=100,
        )
        articles = [
            {
                "title": a["title"],
                # The key can be present with a null value, in which case
                # dict.get()'s default is never used; `or ""` covers both cases.
                "description": a.get("description") or "",
                "published_at": a["publishedAt"],
                "source": a["source"]["name"],
            }
            # Treat a missing or null "articles" entry as "no results".
            for a in (result.get("articles") or [])
        ]
        logger.info(f"Fetched {len(articles)} articles for {query}")
        return articles
Step 3: Feature Engineering
class FeatureEngineer:
    """Add derived return, lag and volume columns to a price DataFrame.

    Every method writes its new columns onto the DataFrame it is given
    (in place) and returns that same object so calls can be chained.
    """

    def add_returns(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add return columns.

        Adds ``daily_return`` (percentage change of ``Close``),
        ``log_return`` (natural log of the ratio of consecutive closes) and
        ``volatility_20d`` (rolling 20-row standard deviation of
        ``daily_return``). Requires a ``Close`` column.
        """
        close = df["Close"]
        df["daily_return"] = close.pct_change()
        df["log_return"] = np.log(close / close.shift(1))
        df["volatility_20d"] = df["daily_return"].rolling(20).std()
        return df

    def add_lag_features(self, df: pd.DataFrame, lags: int = 5) -> pd.DataFrame:
        """Add lagged price features.

        For each lag ``i`` in ``1..lags`` adds ``close_lag_i`` and
        ``return_lag_i``. Requires a ``Close`` column. If ``daily_return``
        is not already present, the returns are derived from ``Close`` for
        the lag columns only; no ``daily_return`` column is added.
        """
        # Fall back to computing returns so this method no longer fails
        # with KeyError when add_returns() has not been called first.
        if "daily_return" in df.columns:
            returns = df["daily_return"]
        else:
            returns = df["Close"].pct_change()
        close = df["Close"]
        for i in range(1, lags + 1):
            df[f"close_lag_{i}"] = close.shift(i)
            df[f"return_lag_{i}"] = returns.shift(i)
        return df

    def add_volume_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add volume-based features.

        Adds ``volume_sma_20`` (rolling 20-row mean of ``Volume``) and
        ``volume_ratio`` (``Volume`` divided by that mean). Requires a
        ``Volume`` column.
        """
        df["volume_sma_20"] = df["Volume"].rolling(20).mean()
        df["volume_ratio"] = df["Volume"] / df["volume_sma_20"]
        return df
Testing
# Smoke test: request the last 365 days of AAPL price history.
# NOTE(review): presumably needs network access, since the data comes from yfinance.
collector = StockDataCollector()
df = collector.fetch_history("AAPL", days=365)
# Show the (rows, columns) shape, then the last five rows of the frame.
print(f"Shape: {df.shape}")
print(df.tail())
Data quality: Always check for missing values, stock splits, and dividends. yfinance adjusts for splits by default, but verify the data yourself: run
df.isnull().sum() to count missing values, and forward-fill any gaps.
Key Takeaways
- yfinance provides free OHLCV data with automatic adjustment for splits and dividends.
- NewsAPI free tier allows 100 requests/day, sufficient for daily sentiment updates.
- Feature engineering adds returns, volatility, lag features, and volume ratios.
- Always validate data quality before feeding it to the model.
Lilly Tech Systems