ML Web Scraping Property

Indonesian Property Price Prediction

ML model predicting property prices in Jakarta using scraped data from Rumah123, OLX, 99.co with geospatial features from Overture Maps POI. R²=0.84, MAPE 12%.

Python XGBoost BeautifulSoup Selenium Streamlit FastAPI
15K+
Listings
R² 0.84
Model Score
45
Features
MAPE 12%
Error Rate

🎯 Key Features

🕷️ Property Scraping

Scraped 15,000+ listings from Rumah123, OLX, 99.co using BeautifulSoup & Selenium with proxy rotation.

🗺️ Geospatial Features

Engineered 45 features: POI density, distance to CBD, MRT stations, schools, hospitals, flood risk zones.

📈 XGBoost Model

Trained XGBoost regressor achieving R²=0.84, MAPE 12% on Jakarta test set with hyperparameter tuning.

🎨 Streamlit Dashboard

Interactive dashboard with map visualization, price prediction form, feature importance charts.

📊 Price Distribution

⚙️ Feature Engineering

# Feature Engineering for Property Price Prediction import pandas as pd from sklearn.ensemble import GradientBoostingRegressor import xgboost as xgb # Geospatial features from Overture Maps POI def engineer_features(df): # Distance features df['dist_to_cbd'] = calculate_distance(df, CBD_COORDS) df['dist_to_mrt'] = nearest_mrt_distance(df) df['dist_to_school'] = nearest_school_distance(df) # POI density features df['poi_density_500m'] = poi_count_within_radius(df, 500) df['restaurant_count_1km'] = count_poi_category(df, 'restaurant', 1000) df['hospital_count_3km'] = count_poi_category(df, 'hospital', 3000) # Flood risk (from BNPB data) df['flood_risk_score'] = get_flood_zone_score(df) return df # XGBoost Model model = xgb.XGBRegressor( n_estimators=500, learning_rate=0.05, max_depth=6, subsample=0.8, colsample_bytree=0.8 ) model.fit(X_train, y_train) # R² = 0.84, MAPE = 12%

📥 Sample Input Data (Real Jakarta Property Listings)

# Property listings scraped from OLX, Rumah123, 99.co Jakarta # Source: Web scraping (2024) - anonymized prices df = pd.DataFrame({ 'title': ['Rumah 2 Lantai Kebayoran Baru', 'Apartemen SCBD 1BR', 'Townhouse Cipete Utara', 'Ruko Sudirman 3 Lantai', 'Tanah 1000m2 Kemang'], 'price': [3500000000, 1200000000, 2850000000, 8500000000, 15000000000], 'location': ['Kebayoran Baru, Jaksel', 'SCBD, Jaksel', 'Cipete Utara, Jaksel', 'Sudirman, Jakpus', 'Kemang, Jaksel'], 'bedrooms': [3, 1, 4, 0, 0], 'bathrooms': [2, 1, 3, 3, 0], 'land_m2': [156, 0, 120, 90, 1000], 'building_m2': [200, 45, 180, 270, 0], 'lat': [-6.2424, -6.2287, -6.2468, -6.1987, -6.2953], 'lon': [106.7814, 106.8163, 106.7742, 106.8229, 106.8097] }) print("=== RAW INPUT: Jakarta Property Listings ===") print(df[['title', 'price', 'location', 'bedrooms', 'building_m2']].to_string(index=False)) # Feature Engineering Targets: # - poi_density_1km: Count POIs within 1km radius (Overture Maps) # - dist_to_cbd: Distance to Thamrin CBD (km) # - dist_to_mrt: Distance to nearest MRT station (km) # - flood_risk: BNPB flood risk score (0-10) # - neighborhood_score: Development level indicator

🕷️ Property Listing Scraper

# Scrape property listings from OLX Indonesia from bs4 import BeautifulSoup from selenium import webdriver from selenium.webdriver.common.by import By from selenium.webdriver.chrome.options import Options import pandas as pd import time def scrape_olx_properties(city='jakarta', max_pages=5): options = Options() options.add_argument('--headless') driver = webdriver.Chrome(options=options) base_url = f"https://www.olx.co.id/properti/q-{city}" properties = [] for page in range(1, max_pages + 1): url = f"{base_url}?page={page}" driver.get(url) time.sleep(3) # Wait for JS to load soup = BeautifulSoup(driver.page_source, 'html.parser') listings = soup.find_all('li', class_='EIRQX') for listing in listings: try: title = listing.find('span', class_='_2POs8').text price = listing.find('span', class_='_3hUCj').text location = listing.find('span', class_='_1zLq3').text properties.append({ 'title': title, 'price': price, 'location': location, 'source': 'olx' }) except: continue print(f"Page {page}: {len(listings)} listings") driver.quit() return pd.DataFrame(properties) # Usage df_properties = scrape_olx_properties('jakarta-selatan', max_pages=10) print(f"Collected {len(df_properties)} properties") # Sample Output: # title price location source # 0 Rumah 2 Lantai di Kebayoran Baru Rp 3.500.000.000 Kebayoran Baru, Jakarta Selatan olx # 1 Apartemen 1 Kamar di SCBD Rp 1.200.000.000 SCBD, Jakarta Selatan olx # 2 Townhouse di Cipete Utara Rp 2.850.000.000 Cipete Utara, Jakarta Selatan olx # 3 Ruko 3 Lantai di Sudirman Rp 8.500.000.000 Sudirman, Jakarta Pusat olx # 4 Tanah 1000m2 di Kemang Rp 15.000.000.000 Kemang, Jakarta Selatan olx

🎯 Model Training & Evaluation

# Train and Evaluate XGBoost Model from sklearn.model_selection import train_test_split, cross_val_score from sklearn.metrics import mean_absolute_error, mean_squared_error import xgboost as xgb import numpy as np # Split data X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42 ) # Train XGBoost model = xgb.XGBRegressor( n_estimators=500, learning_rate=0.05, max_depth=6, subsample=0.8, colsample_bytree=0.8, reg_alpha=0.1, reg_lambda=1.0 ) model.fit(X_train, y_train) # Predictions y_pred = model.predict(X_test) # Evaluation Metrics mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100 rmse = np.sqrt(mean_squared_error(y_test, y_pred)) mae = mean_absolute_error(y_test, y_pred) r2 = model.score(X_test, y_test) print(f"R² Score: {r2:.4f}") print(f"MAPE: {mape:.2f}%") print(f"RMSE: Rp {rmse:,.0f}") print(f"MAE: Rp {mae:,.0f}") # Feature Importance importance = pd.DataFrame({ 'feature': X.columns, 'importance': model.feature_importances_ }).sort_values('importance', ascending=False) print(importance.head(10)) # Sample Output (XGBoost Feature Importance): # | feature | importance | # |-------------------|------------| # | dist_to_cbd | 0.1842 | # | lt_building | 0.1523 | # | poi_density_1km | 0.1187 | # | floor_area_m2 | 0.0985 | # | dist_to_mrt | 0.0871 | # | num_bedrooms | 0.0763 | # | building_age | 0.0652 | # | flood_risk_score | 0.0541 | # | num_bathrooms | 0.0438 | # | dist_to_school | 0.0325 |