I am implementing a parser that converts my university's schedule (Innopolis University) from Google Spreadsheets into .ics files. The spreadsheet looks like this:
The header (1st and 2nd rows) contains the course and the group. The leftmost column contains the weekday and the time.
Each cell represents a schedule entry (entries repeat every week), and each cell contains three lines: the 1st is the name of the event, the 2nd lists the teachers, and the 3rd gives the location plus optional modifiers such as a start time, a change of location on a specific day, or “this event takes place only on the specified days, not every week”.
I have written almost everything except the parsing of the 3rd line of an event (the location string), because it may contain complex patterns. In general it can contain almost anything that occurs to the person who makes the schedule, but I have collected a set of the variants that actually appeared in the schedule.
Maybe you have ideas on how to implement this? Approximate approaches, ML-based ones, and so on are also welcome. I am attaching a test script (the tests may need to be reformulated) together with my current solution, which fails on "313 (WEEK 1-3) / ONLINE", "ONLINE ON 13/09, 108 ON 01/11 (STARTS AT 9:00)" and "314 (312 ON 12/09,19/09,26/09) 301 ON 03/10":
import re
from datetime import date, datetime, time
from functools import partial
from unittest import TestCase
from pydantic import BaseModel
import pytest
# Year captured once, when the module is imported.
_CURRENT_YEAR = datetime.today().year
# Date factory with the year pre-filled; call as ydate(day=..., month=...).
ydate = partial(date, year=_CURRENT_YEAR)
class Item(BaseModel):
    """Structured result of parsing one location string.

    Every field is optional and defaults to ``None``; a parser fills in only
    the parts that were present in the text.
    """

    # Normalised place label, e.g. "107", "ONLINE" or "105/ONLINE".
    location: str | None = None
    # Date given by a "STARTS ON/FROM dd/mm" modifier.
    starts_from: date | None = None
    # Time given by a "STARTS AT hh:mm" modifier.
    starts_at: time | None = None
    # Time given by a "TILL hh:mm" modifier.
    till: time | None = None
    # Week numbers given by a "WEEK ..." modifier, ranges already expanded.
    on_weeks: list[int] | None = None
    # Dates given by an "ON / ONLY ON dd/mm, ..." modifier.
    on: list[date] | None = None
    # Nested items, e.g. the "421 ON 11/10" part of "317 (421 ON 11/10)".
    NEST: list["Item"] | None = None

    class Config:
        arbitrary_types_allowed = True


# Resolve the "Item" string annotation used by the NEST field.
Item.update_forward_refs()
# Table of (raw location string, expected parsed Item) pairs used to
# parametrize the parser test.
cases = [
    # Simple
    ("303", Item(location="303")),
    ("room 107", Item(location="107")),
    ("room #107", Item(location="107")),
    ("ROOM #107", Item(location="107")),
    ("ONLINE", Item(location="ONLINE")),
    ("online", Item(location="ONLINE")),
    ("106/313/314/316/318/320/421", Item(location="106/313/314/316/318/320/421")),
    ("105/ (ONLINE)", Item(location="105/ONLINE")),
    # starts_from modifier
    ("STARTS ON 2/10", Item(starts_from=ydate(day=2, month=10))),
    ("STARTS FROM 21/09", Item(starts_from=ydate(day=21, month=9))),
    ("304 Starts from 19/09", Item(location="304", starts_from=ydate(day=19, month=9))),
    ("313 (STARTS FROM 21/09)", Item(location="313", starts_from=ydate(day=21, month=9))),
    # starts_at modifier
    ("STARTS AT 16.10", Item(starts_at=time(hour=16, minute=10))),
    ("107 STARTS AT 16.10", Item(location="107", starts_at=time(hour=16, minute=10))),
    ("107 (STARTS AT 10.50)", Item(location="107", starts_at=time(hour=10, minute=50))),
    # week modifiers
    ("WEEK 2-4 ONLY", Item(on_weeks=[2, 3, 4])),
    ("105 (WEEK 2-3 ONLY)", Item(location="105", on_weeks=[2, 3])),
    ("105 (WEEK 2, 4 ONLY)", Item(location="105", on_weeks=[2, 4])),
    ("105 (WEEK 2 ONLY)", Item(location="105", on_weeks=[2])),
    ("105 (WEEK 2)", Item(location="105", on_weeks=[2])),
    # on modifier
    ("ON 13/09", Item(on=[ydate(day=13, month=9)])),
    ("ONLY ON 13/09", Item(on=[ydate(day=13, month=9)])),
    ("ONLY ON 13/09, 20/09", Item(on=[ydate(day=13, month=9), ydate(day=20, month=9)])),
    ("ONLINE ON 13/09", Item(location="ONLINE", on=[ydate(day=13, month=9)])),
    (
        "107 (ONLY ON 8/09, 29/09, 27/10, 17/11)",
        Item(
            location="107",
            on=[ydate(day=8, month=9), ydate(day=29, month=9), ydate(day=27, month=10), ydate(day=17, month=11)],
        ),
    ),
    (
        "107 (ON 8/09, 29/09, 27/10, 17/11)",
        Item(
            location="107",
            on=[ydate(day=8, month=9), ydate(day=29, month=9), ydate(day=27, month=10), ydate(day=17, month=11)],
        ),
    ),
    (
        "ONLINE (only on 31/08 and 14/09)",
        Item(location="ONLINE", on=[ydate(day=31, month=8), ydate(day=14, month=9)]),
    ),
    # till modifier
    ("TILL 18:00", Item(till=time(hour=18, minute=0))),
    ("107 (TILL 18:00)", Item(location="107", till=time(hour=18, minute=0))),
    # Multiple modifiers
    (
        "STARTS AT 18:00 TILL 21:00",
        Item(starts_at=time(hour=18, minute=0), till=time(hour=21, minute=0)),
    ),
    (
        "TILL 21:00 STARTS AT 18:00",
        Item(starts_at=time(hour=18, minute=0), till=time(hour=21, minute=0)),
    ),
    (
        "(STARTS AT 18:00) TILL 21:00",
        Item(starts_at=time(hour=18, minute=0), till=time(hour=21, minute=0)),
    ),
    (
        "ON 13/09 STARTS AT 18:00",
        Item(on=[ydate(day=13, month=9)], starts_at=time(hour=18, minute=0)),
    ),
    (
        "ONLINE ON 13/09 STARTS AT 18:00",
        Item(location="ONLINE", on=[ydate(day=13, month=9)], starts_at=time(hour=18, minute=0)),
    ),
    (
        "107 (TILL 21:00) STARTS AT 18:00",
        Item(location="107", starts_at=time(hour=18, minute=0), till=time(hour=21, minute=0)),
    ),
    # NEST
    ("317 (421 ON 11/10)", Item(location="317", NEST=[Item(location="421", on=[ydate(day=11, month=10)])])),
    (
        "105 (room #107 on 28/08)",
        Item(location="105", NEST=[Item(location="107", on=[ydate(day=28, month=8)])]),
    ),
    (
        # The nested entry must carry the room named in the input (313) and
        # the same 1-based week numbers the other WEEK cases use.
        "313 (WEEK 1-3) / ONLINE",
        Item(location="ONLINE", NEST=[Item(location="313", on_weeks=[1, 2, 3])]),
    ),
    (
        "ONLINE ON 13/09, 108 ON 01/11 (STARTS AT 9:00)",
        Item(
            starts_at=time(hour=9, minute=0),
            NEST=[
                Item(location="ONLINE", on=[ydate(day=13, month=9)]),
                Item(location="108", on=[ydate(day=1, month=11)]),
            ],
        ),
    ),
    (
        "314 (312 ON 12/09,19/09,26/09) 301 ON 03/10",
        Item(
            location="314",
            NEST=[
                Item(
                    location="312",
                    on=[
                        ydate(day=12, month=9),
                        ydate(day=19, month=9),
                        ydate(day=26, month=9),
                    ],
                ),
                Item(location="301", on=[ydate(day=3, month=10)]),
            ],
        ),
    ),
    (
        "107 (STARTS at 18:00) TILL 21:00",
        Item(location="107", starts_at=time(hour=18, minute=0), till=time(hour=21, minute=0)),
    ),
]
def f(x: str, from_parent: bool = False) -> Item | None:
    """Parse the location line of a schedule cell into an ``Item``.

    The text is upper-cased, ``(ONLINE)`` is unwrapped, surrounding whitespace
    is stripped and `` AND `` is rewritten to ``, ``. The following shapes are
    then tried in order, and the first one that matches wins:

    1. a bare location (``303``, ``ROOM #107``, ``ONLINE``, ``105/ONLINE``);
    2. a single modifier (``STARTS FROM``, ``STARTS AT``, ``WEEK``, ``ON``,
       ``TILL``), optionally preceded by a location;
    3. two modifiers, optionally preceded by a location;
    4. a location followed by a nested clause that is itself parsed by a
       recursive call (one nesting level only).

    Args:
        x: Raw location string.
        from_parent: True when called recursively for a nested clause; the
            nested-clause shape (4) is then not attempted.

    Returns:
        The parsed ``Item``, or ``None`` when no shape matches. Strings that
        combine several locations, each with its own modifiers, at the top
        level are not covered by these shapes.

    Raises:
        ValueError: If a matched date or time is out of range (for example
            ``31/02`` or ``25:00``).
    """
    x = x.upper()
    x = x.replace("(ONLINE)", "ONLINE")
    x = x.strip()
    # Treat "A AND B" as the comma-separated list "A, B".
    x = re.sub(r"\s+AND\s+", ", ", x)

    def combine_patterns(patterns):
        """Join regex fragments into one capturing alternation group."""
        return r"(" + "|".join(patterns) + r")"

    def parse_date(text: str):
        """Turn ``dd/mm`` or ``dd.mm`` into a date in the current year."""
        day, month = text.replace(".", "/").split("/")
        return ydate(day=int(day), month=int(month))

    def parse_time(text: str):
        """Turn ``hh:mm`` or ``hh.mm`` into a ``time``."""
        hour, minute = text.replace(".", ":").split(":")
        return time(hour=int(hour), minute=int(minute))

    def simple_location(y: str):
        """Return the normalised location label, or None if ``y`` is not one."""
        if m := re.fullmatch(r"(\d+)", y):
            return m.group(1)
        if m := re.fullmatch(r"ROOM\s*#?\s*(\d+)", y):
            return m.group(1)
        if m := re.fullmatch(r"ONLINE", y):
            return m.group(0)
        if m := re.fullmatch(r"((\d|ONLINE)+(?:\s*/\s*(\d|ONLINE)+)+)", y):
            # Drop the whitespace around each "/" separator.
            return "/".join(part.strip() for part in m.group(1).split("/"))
        return None

    _simple_location_pattern = combine_patterns(
        [r"(\d+)", r"ROOM\s*#?\s*(\d+)", r"ONLINE", r"((\d|ONLINE)+(?:\s*/\s*(\d|ONLINE)+)+)"]
    )

    def location_plus_pattern(group_name: str, pattern: str):
        """Build 'location, one space, optionally parenthesised <pattern>'."""
        return rf"(?P<location>{_simple_location_pattern}) \(?(?P<{group_name}>{pattern})\)?"

    if as_simple_location := simple_location(y=x):
        return Item(location=as_simple_location)

    _starts_from_pattern = r"STARTS\s*(ON|FROM)\s*(\d{1,2}[/.]\d{1,2})"

    def starts_from(y: str):
        if m := re.fullmatch(_starts_from_pattern, y):
            return Item(starts_from=parse_date(m.group(2)))
        return None

    _starts_at_pattern = r"STARTS\s*(AT)\s*(\d{1,2}[:.]\d{1,2})"

    def starts_at(y: str):
        if m := re.fullmatch(_starts_at_pattern, y):
            return Item(starts_at=parse_time(m.group(2)))
        return None

    _week_pattern = r"WEEK\s*(?P<weeks>\d+(?:-\d+)?(?:,\s*\d+(?:-\d+)?)*)(?:\s+ONLY)?"

    def week(y: str):
        if m := re.fullmatch(_week_pattern, y):
            weeks: list[int] = []
            for chunk in m.group("weeks").split(","):
                bounds = chunk.split("-")
                if len(bounds) == 2:
                    # "2-4" is an inclusive range.
                    weeks.extend(range(int(bounds[0]), int(bounds[1]) + 1))
                else:
                    weeks.append(int(bounds[0]))
            return Item(on_weeks=weeks)
        return None

    _on_pattern = r"(ON|ONLY\s+ON)\s*(?P<dates>(\d{1,2}[/.]\d{1,2}(?:,\s*\d{1,2}[/.]\d{1,2})*))"

    def on(y: str):
        if m := re.fullmatch(_on_pattern, y):
            return Item(on=[parse_date(d) for d in m.group("dates").split(",")])
        return None

    _till_pattern = r"TILL\s*(?P<time>\d{1,2}[:.]\d{1,2})"

    def till(y: str):
        if m := re.fullmatch(_till_pattern, y):
            return Item(till=parse_time(m.group("time")))
        return None

    _any_modifier_pattern = combine_patterns(
        [_starts_from_pattern, _starts_at_pattern, _week_pattern, _on_pattern, _till_pattern]
    )

    def any_modifier(y: str):
        """Parse ``y`` as exactly one modifier; None if it is not one."""
        for parser in (starts_from, starts_at, week, on, till):
            parsed = parser(y)
            if parsed is not None:
                return parsed
        return None

    if as_any_modifier := any_modifier(x):
        return as_any_modifier

    if m := re.fullmatch(location_plus_pattern("any_modifier", _any_modifier_pattern), x):
        as_any_modifier = any_modifier(m.group("any_modifier"))
        if as_any_modifier is not None:
            as_any_modifier.location = simple_location(m.group("location"))
            return as_any_modifier

    # The modifier pattern is embedded twice below, and a regex may not define
    # the same group name twice, so strip the names from the embedded copies.
    _any_modifier_pattern_noname = re.sub(r"\(\?P<[^>]+>", "(?:", _any_modifier_pattern)
    _two_modifiers_pattern = (
        rf"\(?(?P<first>{_any_modifier_pattern_noname})\)?\s*\(?(?P<second>{_any_modifier_pattern_noname})\)?"
    )

    def two_modifiers(y: str):
        """Parse two consecutive modifiers and merge them into one Item."""
        if m := re.fullmatch(_two_modifiers_pattern, y):
            as_z1 = any_modifier(m.group("first"))
            as_z2 = any_modifier(m.group("second"))
            if as_z1 and as_z2:
                # On a field clash the second modifier's value wins.
                combined = as_z1.dict(exclude_none=True) | as_z2.dict(exclude_none=True)
                return Item.parse_obj(combined)
        return None

    if as_two_modifiers := two_modifiers(x):
        return as_two_modifiers

    if m := re.fullmatch(location_plus_pattern("two_modifiers", _two_modifiers_pattern), x):
        as_two_modifiers = two_modifiers(m.group("two_modifiers"))
        if as_two_modifiers is not None:
            as_two_modifiers.location = simple_location(m.group("location"))
            return as_two_modifiers

    if from_parent:  # only one nesting level
        return None

    _simple_nest_pattern = rf"(?P<location>{_simple_location_pattern})\s*\(?(?P<rest>.+)\)?"

    def simple_nest(y: str):
        """Parse 'location (nested clause)' using one recursive call."""
        if m := re.fullmatch(_simple_nest_pattern, y):
            location = simple_location(m.group("location"))
            # The greedy ".+" may keep the closing ")"; the recursive call
            # accepts it through the optional "\)?" at the end of its patterns.
            rest = f(m.group("rest"), from_parent=True)
            if rest is not None:
                return Item(location=location, NEST=[rest])
        return None

    if as_simple_nest := simple_nest(x):
        return as_simple_nest

    return None
@pytest.mark.parametrize("input_, desired", cases, ids=[x for x, _ in cases])
def test_location_parser(input_: str, desired: Item):
    """Check that ``f`` parses ``input_`` into an Item equal to ``desired``."""
    result = f(input_)
    # f() returns None for strings it cannot parse; report that directly
    # instead of failing with AttributeError on ``None.dict()``.
    assert result is not None, f"parser returned None for {input_!r}"
    TestCase().assertDictEqual(result.dict(), desired.dict())
5