SuperCat's Home

今日も頑張ろね~

使用GLib的正则表达式实现CUE文件的解析

| Comments

ISO C标准本身并没有提供处理正则表达式的能力,正则表达式只在POSIX C中有实现,这使得程序的跨平台能力受到了限制(虽然有Cygwin这样的东西)。众所周知,用C处理字符串比较费事,没有正则表达式的帮助,处理字符串是一大难题,这也是很多人转向Perl、Python等语言的原因。而GLib实现了自己的一套正则表达式处理函数,为C的跨平台开发提供了更多的便利。

本例子中,需要对CUE Sheet文件进行解析,其文件格式如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
REM GENRE "Electronica"
REM DATE "1998"
PERFORMER "Faithless"
TITLE "Live in Berlin"
FILE "Faithless - Live in Berlin.mp3" MP3
  TRACK 01 AUDIO
    TITLE "Reverence"
    PERFORMER "Faithless"
    INDEX 01 00:00:00
  TRACK 02 AUDIO
    TITLE "She's My Baby"
    PERFORMER "Faithless"
    INDEX 01 06:42:00
  TRACK 03 AUDIO
    TITLE "Take the Long Way Home"
    PERFORMER "Faithless"
    INDEX 01 10:54:00
  TRACK 04 AUDIO
    TITLE "Insomnia"
    PERFORMER "Faithless"
    INDEX 01 17:04:00
  TRACK 05 AUDIO
    TITLE "Bring the Family Back"
    PERFORMER "Faithless"
    INDEX 01 25:44:00
  TRACK 06 AUDIO
    TITLE "Salva Mea"
    PERFORMER "Faithless"
    INDEX 01 30:50:00
  TRACK 07 AUDIO
    TITLE "Dirty Old Man"
    PERFORMER "Faithless"
    INDEX 01 38:24:00
  TRACK 08 AUDIO
    TITLE "God Is a DJ"
    PERFORMER "Faithless"
    INDEX 01 42:35:00

其中,需要获得CUE的FILE、PERFORMER、TITLE属性,以及各个音轨的TITLE、PERFORMER、INDEX属性。通过使用GLib的正则表达式匹配并获取相应的字符串,进而读出相应属性的值。

首先,全文搜索FILE属性,如果在文本中搜索到该标签,就获取它的值:

CUE解析器
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
/*
 * RCCueInputType:
 * Tells rc_cue_read_data() how to interpret its @input string.
 */
typedef enum RCCueInputType
{
    /* @input is a URI; it is converted to a local path and the file is read. */
    RC_CUE_INPUT_URI = 0,
    /* @input is a file path; the file contents are read as the CUE sheet. */
    RC_CUE_INPUT_PATH = 1,
    /* @input is the CUE sheet text itself (no file is read).
     * NOTE: "EMBEDED" is misspelled, but the name is part of the API. */
    RC_CUE_INPUT_EMBEDED = 2
}RCCueInputType;

/*
 * RCCueTrack:
 * One track entry parsed from a CUE sheet by rc_cue_read_data().
 */
typedef struct RCCueTrack {
    guint index;        /* track number taken from the "TRACK nn AUDIO" line */
    gchar *title;       /* track TITLE value (heap string), or NULL if absent */
    gchar *performer;   /* track PERFORMER value (heap string), or NULL if absent */
    guint64 time0;      /* position from "INDEX 00", scaled by GST_MSECOND
                         * (presumably GStreamer nanoseconds - confirm) */
    guint64 time1;      /* position from "INDEX 01", same unit as time0 */
}RCCueTrack;

/*
 * RCCueData:
 * Result of parsing a CUE sheet with rc_cue_read_data().
 */
typedef struct RCCueData
{
    RCCueInputType type;    /* the input type the data was parsed from */
    gchar *file;            /* audio file named by the FILE line, joined with the
                             * CUE file's directory (a URI for RC_CUE_INPUT_URI);
                             * NULL for embedded CUE data */
    gchar *performer;       /* album-level PERFORMER value, or NULL */
    gchar *title;           /* album-level TITLE value, or NULL */
    guint length;           /* number of elements in @track */
    RCCueTrack *track;      /* heap-allocated array of @length tracks */
}RCCueData;

/* ... */


/**
 * rc_cue_read_data:
 * @input: the input data
 * @type: the data type of the input data
 * @data: the parsed CUE data
 *
 * Read and parse data from CUE file or string.
 *
 * Returns: The track number, 0 if the CUE data is incorrect.
 */

guint rc_cue_read_data(const gchar *input, RCCueInputType type,
    RCCueData *data)
{
    const gchar *locale = NULL;
    gchar *path = NULL;
    gchar *ex_encoding = NULL;
    gchar *buf = NULL, *line = NULL;
    gchar *dir = NULL;
    gchar *tmp = NULL;
    gchar *cue_raw_data = NULL, *cue_tmp_data = NULL, *cue_new_data = NULL;
    gsize cue_raw_length = 0, cue_tmp_length = 0, cue_new_length = 0;
    gint track_index, track_sm, track_ss, track_sd;
    guint64 track_time;
    gchar **line_data_array = NULL;
    gboolean flag;
    guint i = 0;
    gchar chr;
    guint track_num = 0;
    GSList *track_list = NULL, *list_foreach = NULL;
    GRegex *music_filename_regex;
    GRegex *data_regex;
    GMatchInfo *match_info;
    RCCueTrack *cue_track_data = NULL, *cue_track_array = NULL;
    if(input==NULL) return 0;
    if(data==NULL) return 0;
    switch(type)
    {
        case RC_CUE_INPUT_URI:
            path = g_filename_from_uri(input, NULL, NULL);
            if(path==NULL) return 0;
        case RC_CUE_INPUT_PATH:
            if(path==NULL) path = g_strdup(input);
            if(path==NULL) return 0;
            flag = g_file_get_contents(path, &cue_raw_data,
                &cue_raw_length, NULL);
            dir = g_path_get_dirname(path);
            g_free(path);
            if(!flag)
            {
                g_free(dir);
                return 0;
            }
        case RC_CUE_INPUT_EMBEDED:
            break;
        default:
            return 0;
    }
    if(rc_set_get_boolean("Metadata", "AutoEncodingDetect", NULL))
    {
        locale = rc_player_get_locale();
        if(strncmp(locale, "zh_CN", 5)==0)
            ex_encoding = g_strdup("GB18030");
        else if(strncmp(locale, "zh_TW", 5)==0)
            ex_encoding = g_strdup("BIG5");
        else if(strncmp(locale, "ja_JP", 5)==0)
            ex_encoding = g_strdup("ShiftJIS");
        else
            ex_encoding = rc_set_get_string("Metadata",
                "TagExEncoding", NULL);
    }
    else
        ex_encoding = rc_set_get_string("Metadata", "TagExEncoding", NULL);
    if(ex_encoding==NULL) ex_encoding = g_strdup("GBK");
    if(cue_raw_data==NULL)
    {
        if(!g_utf8_validate(input, -1, NULL))
            cue_tmp_data = g_convert(input, -1, "UTF-8", ex_encoding,
                NULL, NULL, NULL);
        else
            cue_tmp_data = g_strdup(input);
    }
    else
    {
        if(!g_utf8_validate(cue_raw_data, -1, NULL))
            cue_tmp_data = g_convert(cue_raw_data, -1, "UTF-8",
                ex_encoding, NULL, NULL, NULL);
        else
            cue_tmp_data = g_strdup(cue_raw_data);
        g_free(cue_raw_data);
    }
    if(ex_encoding!=NULL) g_free(ex_encoding);
    if(cue_tmp_data==NULL)
    {
        if(dir!=NULL) g_free(dir);
        return 0;
    }
    cue_tmp_length = strlen(cue_tmp_data);
    cue_new_data = g_malloc0(sizeof(gchar) * cue_tmp_length);
    for(i=0;i<cue_tmp_length;i++)
    {
        chr = cue_tmp_data[i];
        if(chr!='\r')
        {
            cue_new_data[cue_new_length] = chr;
            cue_new_length++;
        }
        else if(i+1<cue_tmp_length &amp;&amp; cue_new_data[i+1]!='\n')
        {
            cue_new_data[cue_new_length] = '\n';
            cue_new_length++;
        }
    }
    g_free(cue_tmp_data);
    bzero(data, sizeof(RCCueData));
    data->type = type;
    if(type!=RC_CUE_INPUT_EMBEDED)
    {
        music_filename_regex = g_regex_new("(FILE \").*[\"]",
            G_REGEX_CASELESS, 0, NULL);
        g_regex_match(music_filename_regex, cue_new_data, 0, &amp;match_info);
        if(g_match_info_matches(match_info))
        {
            buf = g_match_info_fetch(match_info, 0);
            if(dir!=NULL)
            {
                path = g_strndup(buf+6, strlen(buf)-7);
                if(type==RC_CUE_INPUT_URI)
                {
                    tmp = g_build_filename(dir, path, NULL);
                    data->file = g_filename_to_uri(tmp, NULL, NULL);
                    g_free(tmp);
                }
                else
                    data->file = g_build_filename(dir, path, NULL);
                g_free(path);
            }
            else
                data->file = g_strndup(buf+6, strlen(buf)-7);
            g_free(buf);
        }
        g_match_info_free(match_info);
        g_regex_unref(music_filename_regex);
        if(dir!=NULL) g_free(dir);
        if(data->file==NULL)
        {
            g_free(cue_new_data);
            return 0;
        }
    }
    else
        data->file = NULL;
    data_regex = g_regex_new("\".*[^\"]", G_REGEX_CASELESS, 0, NULL);
    line_data_array = g_strsplit(cue_new_data, "\n", 0);
    for(i=0;line_data_array[i]!=NULL;i++)
    {
        line = line_data_array[i];
        if(g_regex_match_simple("(TRACK )[0-9]+( AUDIO)", line,
            G_REGEX_CASELESS, 0))
        {
            track_num++;
            cue_track_data = g_malloc0(sizeof(RCCueTrack));
            sscanf(line, "%*s%d", &amp;(cue_track_data->index));
            track_list = g_slist_append(track_list, cue_track_data);
        }
        else if(cue_track_data!=NULL &amp;&amp; g_regex_match_simple("(INDEX )[0-9]+ "
            "[0-9]+:[0-9]{2}:[0-9]{2}", line, G_REGEX_CASELESS, 0))
        {
            sscanf(line, "%*s%d %d:%d:%d", &amp;track_index, &amp;track_sm,
                &amp;track_ss, &amp;track_sd);
            track_time = (track_sm * 60 + track_ss) * 1000 + 10 * track_sd;
            track_time *= GST_MSECOND;
            if(track_index==0)
                cue_track_data->time0 = track_time;
            else if(track_index==1)
                cue_track_data->time1 = track_time;
        }
        else if(g_regex_match_simple("(TITLE \").*[\"]", line,
            G_REGEX_CASELESS, 0))
        {
            g_regex_match(data_regex, line, 0, &amp;match_info);
            if(g_match_info_matches(match_info))
            {
                buf = g_match_info_fetch(match_info, 0);
                if(buf!=NULL &amp;&amp; strlen(buf)>1)
                {
                    if(cue_track_data!=NULL)
                    {
                        if(cue_track_data->title!=NULL)
                            g_free(cue_track_data->title);
                        cue_track_data->title = g_strdup(buf+1);
                    }
                    else
                    {
                        if(data->title!=NULL) g_free(data->title);
                        data->title = g_strdup(buf+1);
                    }
                }
                if(buf!=NULL) g_free(buf);
            }
            g_match_info_free(match_info);
        }
        else if(g_regex_match_simple("(PERFORMER \").*[\"]", line,
            G_REGEX_CASELESS, 0))
        {
            g_regex_match(data_regex, line, 0, &amp;match_info);
            if(g_match_info_matches(match_info))
            {
                buf = g_match_info_fetch(match_info, 0);
                if(buf!=NULL &amp;&amp; strlen(buf)>1)
                {
                    if(cue_track_data!=NULL)
                    {
                        if(cue_track_data->performer!=NULL)
                            g_free(cue_track_data->performer);
                        cue_track_data->performer = g_strdup(buf+1);
                    }
                    else
                    {
                        if(data->performer!=NULL) g_free(data->performer);
                        data->performer = g_strdup(buf+1);
                    }
                }
                if(buf!=NULL) g_free(buf);
            }
            g_match_info_free(match_info);
        }
    }
    g_strfreev(line_data_array);
    g_free(cue_new_data);
    g_regex_unref(data_regex);
    i = 0;
    cue_track_array = g_malloc0(sizeof(RCCueTrack) * track_num);
    for(list_foreach=track_list;list_foreach!=NULL;
        list_foreach=g_slist_next(list_foreach))
    {
        memcpy(cue_track_array+i, list_foreach->data, sizeof(RCCueTrack));
        g_free(list_foreach->data);
        i++;
    }
    g_slist_free(track_list);
    data->track = cue_track_array;
    data->length = track_num;
    return track_num;
}

要进行正则表达式的匹配,首先要创建GRegex对象:使用g_regex_new()即可得到一个指向GRegex的指针,其用法可在GLib的参考手册中查阅,此处不做过多说明(下同)。然后使用g_regex_match()函数在所指定的文本缓冲区中进行搜索,搜索的结果会放到GMatchInfo中:通过g_match_info_matches()函数可以得知是否找到了符合表达式的文本,通过g_match_info_fetch()函数即可获得相应的文本。使用完毕后,分别用g_match_info_free()和g_regex_unref()释放它们。

接着逐行读取CUE文件,获得TITLE, PERFORMER信息,以及每个轨道的TITLE, PERFORMER, INDEX信息。 使用GLib中的g_strsplit()函数即可将文本分割成行,对每行都进行相应的正则表达式匹配,如果满足匹配条件,就获取它的值,并且保存到相应的数据结构中。

上面的例子中,针对CUE的格式,使用GLib提供的正则表达式对其进行解析。对于其它的简单的文本格式,也可以使用正则表达式进行解析。希望本文能够抛砖引玉,让大家能更加了解GLib中提供的各个函数,为自己的开发提供更多的帮助。

Comments